-## Ignore Visual Studio temporary files, build results, and
-## files generated by popular Visual Studio add-ons.
-
-# User-specific files
-*.suo
-*.user
-*.userosscache
-*.sln.docstates
-
-# User-specific files (MonoDevelop/Xamarin Studio)
-*.userprefs
-
-# Build results
-[Dd]ebug/
-[Dd]ebugPublic/
-[Rr]elease/
-[Rr]eleases/
-[Xx]64/
-[Xx]86/
-[Bb]uild/
-bld/
-[Bb]in/
-[Oo]bj/
-
-# PY.TEST
-*.pyc
-tests/integration/report.html
-tests/integration/report.xml
-tests/integration/assets/
-tests/integration/__pycache__/
-
-# Visual Studio 2015 cache/options directory
-.vs/
-# Uncomment if you have tasks that create the project's static files in wwwroot
-#wwwroot/
-
-# MSTest test Results
-[Tt]est[Rr]esult*/
-[Bb]uild[Ll]og.*
-
-# NUNIT
-*.VisualState.xml
-TestResult.xml
-
-# Build Results of an ATL Project
-[Dd]ebugPS/
-[Rr]eleasePS/
-dlldata.c
-
-# DNX
-project.lock.json
-artifacts/
-
-*_i.c
-*_p.c
-*_i.h
-*.ilk
-*.meta
-*.obj
-*.pch
-*.pdb
-*.pgc
-*.pgd
-*.rsp
-*.sbr
-*.tlb
-*.tli
-*.tlh
-*.tmp
-*.tmp_proj
-*.log
-*.vspscc
-*.vssscc
-.builds
-*.pidb
-*.svclog
-*.scc
-
-# Chutzpah Test files
-_Chutzpah*
-
-# Visual C++ cache files
-ipch/
-*.aps
-*.ncb
-*.opendb
-*.opensdf
-*.sdf
-*.cachefile
-*.VC.db
-
-# Visual Studio profiler
-*.psess
-*.vsp
-*.vspx
-*.sap
-
-# TFS 2012 Local Workspace
-$tf/
-
-# Guidance Automation Toolkit
-*.gpState
-
-# ReSharper is a .NET coding add-in
-_ReSharper*/
-*.[Rr]e[Ss]harper
-*.DotSettings.user
-
-# JustCode is a .NET coding add-in
-.JustCode
-
-# TeamCity is a build add-in
-_TeamCity*
-
-# DotCover is a Code Coverage Tool
-*.dotCover
-
-# NCrunch
-_NCrunch_*
-.*crunch*.local.xml
-nCrunchTemp_*
-
-# MightyMoose
-*.mm.*
-AutoTest.Net/
-
-# Web workbench (sass)
-.sass-cache/
-
-# Installshield output folder
-[Ee]xpress/
-
-# DocProject is a documentation generator add-in
-DocProject/buildhelp/
-DocProject/Help/*.HxT
-DocProject/Help/*.HxC
-DocProject/Help/*.hhc
-DocProject/Help/*.hhk
-DocProject/Help/*.hhp
-DocProject/Help/Html2
-DocProject/Help/html
-
-# Click-Once directory
-publish/
-
-# Publish Web Output
-*.[Pp]ublish.xml
-*.azurePubxml
-
-# TODO: Un-comment the next line if you do not want to checkin
-# your web deploy settings because they may include unencrypted
-# passwords
-#*.pubxml
-*.publishproj
-
-# NuGet Packages
-*.nupkg
-# The packages folder can be ignored because of Package Restore
-**/packages/*
-# except build/, which is used as an MSBuild target.
-!**/packages/build/
-# Uncomment if necessary however generally it will be regenerated when needed
-#!**/packages/repositories.config
-# NuGet v3's project.json files produces more ignoreable files
-*.nuget.props
-*.nuget.targets
-
-# Microsoft Azure Build Output
-csx/
-*.build.csdef
-
-# Microsoft Azure Emulator
-ecf/
-rcf/
-
-# Microsoft Azure ApplicationInsights config file
-ApplicationInsights.config
-
-# Windows Store app package directory
-AppPackages/
-BundleArtifacts/
-
-# Visual Studio cache files
-# files ending in .cache can be ignored
-*.[Cc]ache
-# but keep track of directories ending in .cache
-!*.[Cc]ache/
-
-# Others
-ClientBin/
-[Ss]tyle[Cc]op.*
-~$*
-*~
-*.dbmdl
-*.dbproj.schemaview
-*.pfx
-*.publishsettings
-node_modules/
-orleans.codegen.cs
-
-# RIA/Silverlight projects
-Generated_Code/
-
-# Backup & report files from converting an old project file
-# to a newer Visual Studio version. Backup files are not needed,
-# because we have git ;-)
-_UpgradeReport_Files/
-Backup*/
-UpgradeLog*.XML
-UpgradeLog*.htm
-
-# SQL Server files
-*.mdf
-*.ldf
-
-# Business Intelligence projects
-*.rdl.data
-*.bim.layout
-*.bim_*.settings
-
-# Microsoft Fakes
-FakesAssemblies/
-
-# GhostDoc plugin setting file
-*.GhostDoc.xml
-
-# Target VS files:
-vsx64
-
-# Node.js Tools for Visual Studio
-.ntvs_analysis.dat
-
-# Visual Studio 6 build log
-*.plg
-
-# Visual Studio 6 workspace options file
-*.opt
-
-# Visual Studio LightSwitch build output
-**/*.HTMLClient/GeneratedArtifacts
-**/*.DesktopClient/GeneratedArtifacts
-**/*.DesktopClient/ModelManifest.xml
-**/*.Server/GeneratedArtifacts
-**/*.Server/ModelManifest.xml
-_Pvt_Extensions
-
-# LightSwitch generated files
-GeneratedArtifacts/
-ModelManifest.xml
-
-# Paket dependency manager
-.paket/paket.exe
-
-# FAKE - F# Make
-.fake/
-*.filters
-/External
-/Output
-/InferenceEngineMain/models
-/Test
-/HTTPClient/*.a
-/InferenceEngineMain/newModels
+# build/artifact dirs
+_*
+# but ensure we don't skip __init__.py
+!__init__.py
+# developer tools
+.idea
+.vscode
+cmake-build-debug
+cmake-build-release
.DS_Store
-
-# For IDEA
-.idea/
-VS/
-Xcode/
-temp/
-report/
-.kdev4/
-*.kdev4
-*.kate-swp
-
-/lin-build
-/win-build
-/CMakeFiles
-*.stamp
-*.depend
-*.vcxproj
-*.sln
-/CMakeCache.txt
-.vimprj/
-build_IA32/
-.dir-locals.el
-GTAGS
-GPATH
-GRTAGS
-GSYMS
+**/tags
compile_commands.json
-service/dot-net-service/Output
-**/sublime_build
-/.project
-.vscode/
-/vsx32
-/service/dot-net-service/.klocwork/DotNetService
-cmake-build-*/
-/lin64
-
-.gdb_history
+bin/
+build/
.local_vimrc
-.ycm_extra_conf.py
-tags
-
-
-# from Model Optimizer repo
-.idea
-.project
-.cproject
-.pydevproject
-.settings
-/bin/
-/gen/
-__pycache__
-*.swp
-/config.xml
-
-# Python-specific
-.env3
-*.pyc
-
-# Tests-specific
-.coverage
-htmlcov
-pylint_report.txt
-pylint_report_comments.txt
-
-# Documentation-generated
-docs/build
-docs/source/_static
-docs/source/_templates
-docs/source/generated/
-
-# Artifacts
-/*.bin
-/*.xml
-/*.json
-/*.so
-/*.txt
-/*.mapping
-/*.dat
-/*.svg
+.gdb_history
+.vimspector.json
+doc/
+docs/build_documentation/work_dir/
+inference-engine/plugins/
+.repo/
+docs/template_plugin/html/
+CMakeLists.txt.user
+docs/IE_PLUGIN_DG/html/
[submodule "ngraph"]
path = ngraph
url = https://github.com/NervanaSystems/ngraph.git
- ignore = dirty
+ ignore = dirty
\ No newline at end of file
if (NOT ANDROID)
ngraph_set(NGRAPH_UNIT_TEST_ENABLE TRUE)
- ngraph_set(NGRAPH_UNIT_TEST_OPENVINO_ENABLE TRUE)
+ ngraph_set(NGRAPH_IE_ENABLE TRUE)
# ngraph_set(NGRAPH_ONNX_IMPORT_ENABLE TRUE)
set(NGRAPH_ONNX_IMPORT_ENABLE TRUE CACHE BOOL "" FORCE)
else()
ngraph_set(NGRAPH_UNIT_TEST_ENABLE FALSE)
ngraph_set(NGRAPH_TEST_UTIL_ENABLE FALSE)
- ngraph_set(NGRAPH_UNIT_TEST_OPENVINO_ENABLE FALSE)
+ ngraph_set(NGRAPH_IE_ENABLE FALSE)
ngraph_set(NGRAPH_ONNX_IMPORT_ENABLE FALSE)
endif()
--- /dev/null
+#!groovy
+
+dldtPipelineEntrypoint(this)
if(WIN32)
set(IE_CPACK_LIBRARY_PATH ${IE_CPACK_IE_DIR}/lib/${CMAKE_BUILD_TYPE}/${ARCH} PARENT_SCOPE)
+ set(IE_CPACK_RUNTIME_PATH ${IE_CPACK_IE_DIR}/bin/${CMAKE_BUILD_TYPE}/${ARCH} PARENT_SCOPE)
+ set(IE_CPACK_ARCHIVE_PATH ${IE_CPACK_IE_DIR}/lib/${CMAKE_BUILD_TYPE}/${ARCH} PARENT_SCOPE)
else()
set(IE_CPACK_LIBRARY_PATH ${IE_CPACK_IE_DIR}/lib/${ARCH} PARENT_SCOPE)
+ set(IE_CPACK_RUNTIME_PATH ${IE_CPACK_IE_DIR}/lib/${ARCH} PARENT_SCOPE)
+ set(IE_CPACK_ARCHIVE_PATH ${IE_CPACK_IE_DIR}/lib/${ARCH} PARENT_SCOPE)
endif()
endfunction()
set(CPACK_GENERATOR "TGZ")
if(WIN32)
set(CPACK_PACKAGE_NAME inference-engine_${CMAKE_BUILD_TYPE})
+ string(REPLACE "\\" "_" CPACK_PACKAGE_VERSION "${CI_BUILD_NUMBER}")
else()
set(CPACK_PACKAGE_NAME inference-engine)
+ string(REPLACE "/" "_" CPACK_PACKAGE_VERSION "${CI_BUILD_NUMBER}")
endif()
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
if (ARG_ADD_CPPLINT)
# code style
add_cpplint_target(${ARG_NAME}_cpplint FOR_TARGETS ${ARG_NAME})
- add_clang_format_target(${ARG_NAME}_clang_format FOR_TARGETS ${ARG_NAME})
endif()
if (ARG_DEVELOPER_PACKAGE)
# developer package
set(multiValueArgs "FOR_TARGETS" "FOR_SOURCES" "EXCLUDE_PATTERNS")
cmake_parse_arguments(CLANG_FORMAT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- if(CLANG_FORMAT_ALL)
- set(all ALL)
- endif()
-
foreach(target IN LISTS CLANG_FORMAT_FOR_TARGETS)
get_target_property(target_sources "${target}" SOURCES)
list(APPEND CLANG_FORMAT_FOR_SOURCES ${target_sources})
"All clang-format output files")
add_custom_target(${TARGET_NAME}
- ${all}
DEPENDS ${all_output_files}
COMMENT "[clang-format] ${TARGET_NAME}")
cmake_policy(SET CMP0054 NEW)
+include(models)
+
#we have number of dependencies stored on ftp
include(dependency_solver)
include(ExternalProject)
+if (ENABLE_SAME_BRANCH_FOR_MODELS)
+ branchName(MODELS_BRANCH)
+else()
+ set(MODELS_BRANCH "master")
+endif()
+
+
+if (ENABLE_DATA)
+ add_models_repo(${ENABLE_DATA} "data:inference-engine/open-source-data.git")
+ set(MODELS_PATH "${TEMP}/data/src/data")
+ set(DATA_PATH "${MODELS_PATH}")
+endif()
+
+message(STATUS "MODELS_PATH=" ${MODELS_PATH})
+
+fetch_models_and_validation_set()
+
include(linux_name)
if(COMMAND get_linux_name)
get_linux_name(LINUX_OS_NAME)
set(ie_options "@IE_OPTIONS@;CMAKE_BUILD_TYPE;CMAKE_SKIP_RPATH")
-load_cache("${cache_path}" READ_WITH_PREFIX "" ${ie_options})
+foreach(option IN LISTS ie_options)
+ if(NOT DEFINED "${option}")
+ load_cache("${cache_path}" READ_WITH_PREFIX "" ${option})
+ endif()
+endforeach()
message(STATUS "The following CMake options are exported from Inference Engine Developer package")
message("")
ie_dependent_option (ENABLE_MYRIAD_MVNC_TESTS "functional and behavior tests for mvnc api" OFF "ENABLE_TESTS;ENABLE_MYRIAD" OFF)
-ie_dependent_option (ENABLE_SAMPLES "console samples are part of inference engine package" ON "NOT MINGW" OFF)
+ie_dependent_option (ENABLE_DATA "fetch models from open-source-data repo" ON "ENABLE_FUNCTIONAL_TESTS;NOT ANDROID" OFF)
+
+ie_dependent_option (ENABLE_SAME_BRANCH_FOR_MODELS "uses same branch for models and for inference engine, if not enabled models are taken from master" OFF "ENABLE_TESTS" OFF)
ie_dependent_option (ENABLE_BEH_TESTS "tests oriented to check inference engine API corecteness" ON "ENABLE_TESTS" OFF)
--- /dev/null
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+if(ENABLE_DOCKER)
+ cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
+else()
+ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+endif()
+
+cmake_policy(SET CMP0054 NEW)
+
+find_package(Git REQUIRED)
+
+set(MODELS_LST "")
+set(MODELS_LST_TO_FETCH "")
+
+function (add_models_repo add_to_fetcher model_name)
+ list(LENGTH ARGV add_models_args)
+ if (add_models_args EQUAL 3)
+ list(GET ARGV 2 branch_name)
+ else()
+ set(branch_name ${MODELS_BRANCH})
+ endif()
+ if (add_to_fetcher)
+ set(model_name "${model_name}:${branch_name}")
+ list(APPEND MODELS_LST_TO_FETCH ${model_name})
+ endif()
+
+ list(APPEND MODELS_LST ${model_name})
+
+ set(MODELS_LST_TO_FETCH ${MODELS_LST_TO_FETCH} PARENT_SCOPE)
+ set(MODELS_LST ${MODELS_LST} PARENT_SCOPE)
+endfunction()
+
+function(add_lfs_repo name prefix url tag)
+ ExternalProject_Add(${name}
+ PREFIX ${prefix}
+ GIT_REPOSITORY ${url}
+ GIT_TAG ${tag}
+ GIT_CONFIG "http.sslverify=false"
+ GIT_PROGRESS 1
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ LOG_DOWNLOAD ON)
+
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} lfs install --local --force
+ WORKING_DIRECTORY ${prefix}/src/${name}
+ OUTPUT_VARIABLE lfs_output
+ RESULT_VARIABLE lfs_var)
+ if(lfs_var)
+ message(FATAL_ERROR [=[
+ Failed to setup Git LFS: ${lfs_output}
+ Git lfs must be installed in order to fetch models
+ Please install it from https://git-lfs.github.com/
+ ]=])
+ endif()
+endfunction()
+
+function (fetch_models_and_validation_set)
+ foreach(loop_var ${MODELS_LST_TO_FETCH})
+ string(REPLACE ":" ";" MODEL_CONFIG_LST ${loop_var})
+
+ list(GET MODEL_CONFIG_LST 0 folder_name)
+ list(GET MODEL_CONFIG_LST 1 repo_name)
+ list(GET MODEL_CONFIG_LST 2 branch_name)
+
+ string(FIND ${folder_name} "model" IS_MODEL)
+ if(${folder_name} MATCHES "model*")
+ set(FOLDER_NAME "/models/src")
+ endif()
+ add_lfs_repo(
+ "${folder_name}"
+ ${TEMP}${FOLDER_NAME}/${folder_name}
+ "git@gitlab-icv.inn.intel.com:${repo_name}"
+ "${branch_name}")
+ endforeach(loop_var)
+endfunction()
ie_cpack_add_component(${install_component} REQUIRED DEPENDS core)
install(TARGETS ${IE_PLUGIN_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT ${install_component}
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT ${install_component}
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT ${install_component}
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT ${install_component}
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT ${install_component})
endif()
endfunction()
set(LIBUSB_LIBRARY "${LIBUSB}/libs/${ANDROID_ABI}/libusb1.0.so")
log_rpath_from_dir(LIBUSB "${LIBUSB}/libs/${ANDROID_ABI}")
-endif()
\ No newline at end of file
+endif()
// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+// SPDX-License-Identifier : Apache-2.0
//
#include <stdlib.h>
// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+// SPDX-License-Identifier : Apache-2.0
//
#include <stdlib.h>
# install
install(TARGETS ${TARGET_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
install(DIRECTORY ${InferenceEngine_C_API_SOURCE_DIR}/include/
args = parser.add_argument_group("Options")
args.add_argument('-h', '--help', action='help', default=SUPPRESS, help='Show this help message and exit.')
args.add_argument("-m", "--model", help="Required. Path to an .xml file with a trained model.",
- required=True, type=str)
+ required=True, type=str)
args.add_argument("-i", "--input", help="Required. Path to image file.",
- required=True, type=str, nargs="+")
+ required=True, type=str, nargs="+")
args.add_argument("-l", "--cpu_extension",
- help="Optional. Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.",
- type=str, default=None)
+ help="Optional. Required for CPU custom layers. "
+ "Absolute path to a shared library with the kernels implementations.",
+ type=str, default=None)
args.add_argument("-d", "--device",
- help="Optional. Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified (CPU by default)",
- default="CPU", type=str)
+ help="Optional. Specify the target device to infer on; "
+ "CPU, GPU, FPGA or MYRIAD is acceptable. "
+ "Sample will look for a suitable plugin for device specified (CPU by default)",
+ default="CPU", type=str)
args.add_argument("--labels", help="Optional. Labels mapping file", default=None, type=str)
args.add_argument("-nt", "--number_top", help="Optional. Number of top results", default=10, type=int)
# ------------- 2. Load Plugin for inference engine and extensions library if specified --------------
log.info("Device info:")
versions = ie.get_versions(args.device)
- print("{}{}".format(" "*8, args.device))
- print("{}MKLDNNPlugin version ......... {}.{}".format(" "*8, versions[args.device].major, versions[args.device].minor))
- print("{}Build ........... {}".format(" "*8, versions[args.device].build_number))
+ print("{}{}".format(" " * 8, args.device))
+ print("{}MKLDNNPlugin version ......... {}.{}".format(" " * 8, versions[args.device].major,
+ versions[args.device].minor))
+ print("{}Build ........... {}".format(" " * 8, versions[args.device].build_number))
if args.cpu_extension and "CPU" in args.device:
ie.add_extension(args.cpu_extension, "CPU")
# -----------------------------------------------------------------------------------------------------
# --------------------------- 3. Read and preprocess input --------------------------------------------
- input_blob = next(iter(net.inputs))
- n, c, h, w = net.inputs[input_blob].shape
+
+ print("inputs number: " + str(len(net.inputs.keys())))
+
+ for input_key in net.inputs:
+ print("input shape: " + str(net.inputs[input_key].shape))
+ print("input key: " + input_key)
+ if len(net.inputs[input_key].layout) == 4:
+ n, c, h, w = net.inputs[input_key].shape
+
images = np.ndarray(shape=(n, c, h, w))
images_hw = []
for i in range(n):
log.warning("Image {} is resized from {} to {}".format(args.input[i], image.shape[:-1], (h, w)))
image = image.transpose((2, 0, 1)) # Change data layout from HWC to CHW
images[i] = image
+
# -----------------------------------------------------------------------------------------------------
# --------------------------- 4. Configure input & output ---------------------------------------------
# --------------------------- Prepare input blobs -----------------------------------------------------
log.info("Preparing input blobs")
- assert (len(net.inputs.keys()) == 1 or len(net.inputs.keys()) == 2), "Sample supports topologies only with 1 or 2 inputs"
- input_blob = next(iter(net.inputs))
+ assert (len(net.inputs.keys()) == 1 or len(
+ net.inputs.keys()) == 2), "Sample supports topologies only with 1 or 2 inputs"
out_blob = next(iter(net.outputs))
input_name, input_info_name = "", ""
elif len(net.inputs[input_key].layout) == 2:
input_info_name = input_key
net.inputs[input_key].precision = 'FP32'
- if net.inputs[input_key].shape[1] != 3 and net.inputs[input_key].shape[1] != 6 or net.inputs[input_key].shape[0] != 1:
+ if net.inputs[input_key].shape[1] != 3 and net.inputs[input_key].shape[1] != 6 or \
+ net.inputs[input_key].shape[0] != 1:
log.error('Invalid input info. Should be 3 or 6 values length.')
+ data = {}
+ data[input_name] = images
+
+ if input_info_name != "":
+ infos = np.ndarray(shape=(n, c), dtype=float)
+ for i in range(n):
+ infos[i, 0] = h
+ infos[i, 1] = w
+ infos[i, 2] = 1.0
+ data[input_info_name] = infos
+
# --------------------------- Prepare output blobs ----------------------------------------------------
log.info('Preparing output blobs')
log.info("Loading model to the device")
exec_net = ie.load_network(network=net, device_name=args.device)
log.info("Creating infer request and starting inference")
- res = exec_net.infer(inputs={input_blob: images})
+ res = exec_net.infer(inputs=data)
# -----------------------------------------------------------------------------------------------------
# --------------------------- Read and postprocess output ---------------------------------------------
ymin = np.int(ih * proposal[4])
xmax = np.int(iw * proposal[5])
ymax = np.int(ih * proposal[6])
- print("[{},{}] element, prob = {:.6} ({},{})-({},{}) batch id : {}"\
- .format(number, label, confidence, xmin, ymin, xmax, ymax, imid), end="")
+ print("[{},{}] element, prob = {:.6} ({},{})-({},{}) batch id : {}" \
+ .format(number, label, confidence, xmin, ymin, xmax, ymax, imid), end="")
if proposal[2] > 0.5:
print(" WILL BE PRINTED!")
if not imid in boxes.keys():
# -----------------------------------------------------------------------------------------------------
log.info("Execution successful\n")
- log.info("This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool")
+ log.info(
+ "This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool")
if __name__ == '__main__':
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../__init__.py
)
+# creates a folder in openvino directory and a symlink to benchmark
+# inside the bin directory so developers can run the python benchmark_app
+if(UNIX)
+ add_custom_command(TARGET ${TARGET_NAME}
+ POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../tools
+)
+ file(COPY ${OpenVINO_MAIN_SOURCE_DIR}/tools/benchmark DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../tools/)
+endif()
+
# install
install(TARGETS ${TARGET_NAME}
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie = IECore()
- # exec_net = ie.load_network(network=net, device_name="CPU", num_requsts=2)
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
+ # exec_net = ie.load_network(network=net, device_name="CPU", num_requests=2)
# ```
cpdef ExecutableNetwork load_network(self, IENetwork network, str device_name, config=None, int num_requests=1):
cdef ExecutableNetwork exec_net = ExecutableNetwork()
# @return An `ExecutableNetwork` object
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# exec_net = ie.load_network(network=net, device_name="MYRIAD", num_requsts=2)
# # export executable network
# exec_net.export(path_to_file_to_save)
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# layers_map = ie.query_network(network=net, device_name="HETERO:GPU,CPU")
# ```
def query_network(self, IENetwork network, str device_name, config=None):
return c_map_to_dict(res)
## Sets a configuration for a plugin
- # NOTE: When specifying a key value of a config, the "KEY_" prefix is omitted.
+ #
+ # \note When specifying a key value of a config, the "KEY_" prefix is omitted.
+ #
# @param config: a dictionary of configuration parameters as keys and their values
# @param device_name: a device name of a target plugin
# @return None
#
- # Usage examples: See the `set_affinity` method of the `IENetwork` class
+ # Usage examples:\n
+ # ```python
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
+ # ie.set_config({"DYN_BATCH_ENABLED": "YES"})
+ # ```
def set_config(self, config: dict, device_name: str):
cdef map[string, string] c_config = dict_to_c_map(config)
self.impl.setConfig(c_config, device_name.encode())
## Gets a configuration dedicated to device behavior. The method targets to extract information
# which can be set via set_config method.
- # NOTE: When specifying a key value of a config, the "KEY_" prefix is omitted.
+ #
+ # \note When specifying a key value of a config, the "KEY_" prefix is omitted.
+ #
# @param device_name: A name of a device to get a config value.
# @param config_name: A config name to request.
# @return A config value corresponding to a config key.
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie_core = IECore()
+ # net = ie_core.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# exec_net = ie_core.load_network(net, device, num_requests=2)
# res = exec_net.infer({'data': img})
# res
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie_core = IECore()
+ # net = ie_core.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# exec_net = ie_core.load_network(net, device, num_requsts=2)
# exec_graph = exec_net.get_exec_graph_info()
# ```
# Usage example:\n
# ```python
# ie = IECore()
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# exec_net = ie.load_network(net, "CPU")
# exec_net.get_metric("NETWORK_NAME")
# ```
# Usage example:\n
# ```python
# ie = IECore()
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# exec_net = ie.load_network(net, "CPU")
# exec_net.get_metric("DEVICE_ID")
# ```
# @return None
#
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# exec_net = ie.load_network(network=net, device_name="MYRIAD", num_requsts=2)
# exec_net.export(path_to_file_to_save)
# ```
# Usage example:\n
# ```python
# callback = lambda status, py_data: print("Request with id {} finished with status {}".format(py_data, status))
- # net = IENetwork("./model.xml", "./model.bin")
# ie = IECore()
+ # net = ie.read_network(model="./model.xml", weights="./model.bin")
# exec_net = ie.load_network(net, "CPU", num_requests=4)
# for id, req in enumerate(exec_net.requests):
# req.set_completion_callback(py_callback=callback, py_data=id)
#
# Usage example:\n
# ```python
- # exec_net = plugin.load(network=net, num_requests=2)
+ # exec_net = ie_core.load_network(network=net, num_requests=2)
# exec_net.requests[0].infer({input_blob: image})
# res = exec_net.requests[0].outputs['prob']
# np.flip(np.sort(np.squeeze(res)),0)
#
# Usage example:\n
# ```python
- # exec_net = plugin.load(network=net, num_requests=2)
+ # exec_net = ie_core.load_network(network=net, num_requests=2)
# exec_net.requests[0].async_infer({input_blob: image})
# request_status = exec_net.requests[0].wait()
# res = exec_net.requests[0].outputs['prob']
## Waits for the result to become available. Blocks until specified timeout elapses or the result
# becomes available, whichever comes first.
- # NOTE: There are special values of the timeout parameter:
+ #
+ # \note There are special values of the timeout parameter:
# * 0 - Immediately returns the inference status. It does not block or interrupt execution.
# To find statuses meaning, please refer to InferenceEngine::StatusCode in Inference Engine C++ documentation
# * -1 - Waits until inference result becomes available (default value)
return deref(self.impl).wait(<int64_t> timeout)
## Queries performance measures per layer to get feedback of what is the most time consuming layer.
- # NOTE: Performance counters data and format depends on the plugin
+ #
+ # \note Performance counters data and format depends on the plugin
+ #
# @return Dictionary containing per-layer execution information.
#
# Usage example:
# ```python
- # exec_net = plugin.load(network=net, num_requests=2)
+ # exec_net = ie_core.load_network(network=net, num_requests=2)
# exec_net.requests[0].infer({input_blob: image})
# exec_net.requests[0].get_perf_counts()
# {'Conv2D': {'exec_type': 'jit_avx2_1x1',
## Sets new batch size for certain infer request when dynamic batching is enabled in executable network
# that created this request.
- # NOTE: Support of dynamic batch size depends on the target plugin.
+ #
+ # \note Support of dynamic batch size depends on the target plugin.
#
# @param size: New batch size to be used by all the following inference calls for this request
# @return None
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# # Set max batch size
# net.batch = 10
- # plugin.set_config({"DYN_BATCH_ENABLED": "YES"})
- # exec_net = plugin.load(network=net)
+ # ie.set_config({"DYN_BATCH_ENABLED": "YES"})
+ # exec_net = ie.load_network(network=net)
# # Set batch size for certain network.
# # NOTE: Input data shape will not be changed, but will be used partially in inference which increases performance
# exec_net.requests[0].set_batch(2)
def type(self):
return deref(self._ptr).type.decode()
- ## Layer base operating precision. Provides getter and setter interfaces.
+ ## \note This property is deprecated.
+ # Please, use out_data property to access DataPtr objects for all output ports, which contains full
+ # information about layer's output data including precision.
+ #
+ # Layer base operating precision. Provides getter and setter interfaces.
@property
def precision(self):
warnings.filterwarnings("always", category=DeprecationWarning)
# The affinity attribute provides getter and setter interfaces, so the layer affinity can be modified directly.
# For example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
# ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# layers_map = ie.query_network(network=net, device_name="HETERO:GPU,CPU")
# layers = net.layers
# for layer, device in layers_map.items():
input_to_list.append(deref(layer.second).name.decode())
return input_to_list
- ## Deprecated: use out_data property to access DataPtr objects for all output ports, which contains full
+ ## \note This property is deprecated.
+ # Please, use out_data property to access DataPtr objects for all output ports, which contains full
# information about layer's output data including layout
+ #
# Returns the layout of the layer output data on 1st port
@property
def layout(self):
cdef C.DataPtr c_input = deref(self._ptr).outData[0]
return layout_int_to_str_map[deref(c_input).getLayout()]
- ## Deprecated: use out_data property to access DataPtr objects for all output ports, which contains full
+ ## \note This property is deprecated.
+ # Please, use out_data property to access DataPtr objects for all output ports, which contains full
# information about layer's output data including shape
+ #
# Return the list of dimension of the layer output data on 1st port
@property
def shape(self):
weights_buffer.reset(blob.second)
blobs_map[blob.first.decode()] = weights_buffer.to_numpy()
return blobs_map
- ## Dictionary with layer weights, biases or custom blobs if any
+ ## \note This property is deprecated.
+ # Please use blobs property instead.
+ #
+ # Dictionary with layer weights, biases or custom blobs if any
@property
def weights(self):
warnings.filterwarnings("always", category=DeprecationWarning)
cdef class IENetwork:
## Class constructor
#
+ # \note Reading networks using IENetwork constructor is deprecated.
+ # Please, use IECore.read_network() method instead.
+ #
# @param model: A `.xml` file of the IR or PyCapsule containing smart pointer to nGraph function.
# In case of passing a `.xml` file attribute value can be a string path or bytes with file content
# depending on `init_from_buffer` attribute value
## Batch size of the network. Provides getter and setter interfaces to get and modify the
# network batch size. For example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
- # print(et.batch_size)
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
+ # print(net.batch_size)
# net.batch_size = 4
# print(net.batch_size)
# print(net.inputs['data'].shape)
@property
def batch_size(self):
return self.impl.getBatch()
- ## Deprecated: network precision does not make sence, use precision on egdes.
+ ## \note This property is deprecated:
+ # network precision does not make sense, use precision on edges.
+ #
# Precision of the network
@property
def precision(self):
layers[deref(l).name.decode()] = net_l
return layers
- ## Deprecated: new Calibration Tool doesn't generate statistics
+ ## \note This property is deprecated.
+ # New Calibration Tool doesn't generate statistics
+ #
# Returns `LayersStatsMap` object containing dictionary that maps network layer names to calibration statistics
# represented by `LayerStats` objects.
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# net.stats.update({"conv1_2d" : LayserStats(min=(-25, -1, 0), max=(63, 124, 70)),
# "conv2_2d" : LayserStats(min=(-5, -1, 0, 1, -7, 2), max=(63, 124, 70, 174, 99, 106))
# })
max=tuple(it.second["max".encode()]))
return py_stats_map
- ## NOTE: The function is deprecated. Please use the `IENetwork()` class constructor
- # to create valid instance of `IENetwork`.
- #
- # Reads the model from the `.xml` and `.bin` files of the IR.
- #
- # @param model: Path to `.xml` file of the IR
- # @param weights: Path to `.bin` file of the IR
- # @return An instance of the `IENetwork` class
- @classmethod
- def from_ir(cls, model: str, weights: str):
- warnings.filterwarnings("always", category=DeprecationWarning)
- warnings.warn("from_ir() method of IENetwork is deprecated. "
- "Please use IENetwork class constructor to create valid IENetwork instance",
- DeprecationWarning)
- if not os.path.isfile(model):
- raise Exception("Path to the model {} doesn't exists or it's a directory".format(model))
- if not os.path.isfile(weights):
- raise Exception("Path to the weights {} doesn't exists or it's a directory".format(weights))
- cdef IENetwork net = IENetwork(model, weights)
- return net
## Marks any intermediate layer as output layer to retrieve the inference results from the specified layers.
# @param outputs: List of layers to be set as model outputs. The list can contain strings with layer names to be set
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# net.add_outputs(["conv5_1', conv2_1', (split_2, 1)])]
# ```
def add_outputs(self, outputs):
#
# Usage example:
# ```python
- # net = IENetwork(model=path_to_model, weights=path_to_weights)
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml, weights=path_to_bin)
# net.serialize(path_to_xml, path_to_bin)
# ```
def serialize(self, path_to_xml, path_to_bin: str = ""):
self.impl.serialize(path_to_xml.encode(), path_to_bin.encode())
## Reshapes the network to change spatial dimensions, batch size, or any dimension.
- # NOTE: Before using this method, make sure that the target shape is applicable for the network.
+ #
+ # \note Before using this method, make sure that the target shape is applicable for the network.
# Changing the network shape to an arbitrary value may lead to unpredictable behaviour.
#
# @param input_shapes: A dictionary that maps input layer names to tuples with the target shape
#
# Usage example:\n
# ```python
- # net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+ # ie = IECore()
+ # net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
# input_layer = next(iter(net.inputs))
# n, c, h, w = net.inputs[input_layer]
 # net.reshape({input_layer: (n, c, h*2, w*2)})
# return self.impl.getFunction()
## This class is the main plugin interface and serves to initialize and configure the plugin.
+#
+# \note This class is deprecated: Use IECore instead
+#
cdef class IEPlugin:
- ## Deprecated: Use IECore instead
- # Class constructor
+ ## Class constructor
#
# @param device: Target device name. Supported devices: CPU, GPU, FPGA, MYRIAD, HETERO, MULTI
# @param plugin_dirs: List of paths to plugin directories
/**
* @brief A default constructor
*/
- CNNNetReader(): actual(shared_from_irelease(InferenceEngine::CreateCNNNetReader())) {
+ CNNNetReader(): actual(InferenceEngine::CreateCNNNetReaderPtr()) {
if (actual == nullptr) {
THROW_IE_EXCEPTION << "CNNNetReader was not initialized.";
}
}
private:
- std::shared_ptr<ICNNNetReader> actual;
+ CNNNetReaderPtr actual;
std::shared_ptr<CNNNetwork> network;
};
IE_SUPPRESS_DEPRECATED_END
* @param reader Pointer to the ICNNNetReader object
*/
IE_SUPPRESS_DEPRECATED_START
- explicit CNNNetwork(std::shared_ptr<ICNNNetReader> reader): reader(reader), actual(reader->getNetwork(nullptr)) {
- if (actual == nullptr) {
+ explicit CNNNetwork(CNNNetReaderPtr reader_): reader(reader_) {
+ if (reader == nullptr) {
+ THROW_IE_EXCEPTION << "ICNNNetReader was not initialized.";
+ }
+ if ((actual = reader->getNetwork(nullptr)) == nullptr) {
THROW_IE_EXCEPTION << "CNNNetwork was not initialized.";
}
}
}
/**
+ * @brief An overloaded operator cast to get pointer on current network
+ *
+ * @return A shared pointer of the current network
+ */
+ operator std::shared_ptr<ICNNNetwork>() {
+ return network;
+ }
+
+ /**
* @brief An overloaded operator & to get current network
*
* @return An instance of the current network
*
* @return constant nGraph function
*/
+ std::shared_ptr<ngraph::Function> getFunction() noexcept {
+ return actual->getFunction();
+ }
+
+ /**
+ * @brief Returns constant nGraph function
+ *
+ * @return constant nGraph function
+ */
std::shared_ptr<const ngraph::Function> getFunction() const noexcept {
return actual->getFunction();
}
* @brief Reader extra reference, might be nullptr
*/
IE_SUPPRESS_DEPRECATED_START
- std::shared_ptr<ICNNNetReader> reader;
+ CNNNetReaderPtr reader;
IE_SUPPRESS_DEPRECATED_END
/**
* @brief Network extra interface, might be nullptr
IE_SUPPRESS_DEPRECATED_END
}
+private:
/**
* @brief Loads function from the library and returns a pointer to it
* @param functionName Name of function to load
}
/**
+ * @brief Constructs an object with existing loader
+ * @param so_loader Existing pointer to a library loader
+ */
+ explicit SOPointer(std::shared_ptr<Loader> so_loader)
+ : _so_loader(so_loader),
+ _pointedObj(details::shared_from_irelease(
+ SymbolLoader<Loader>(_so_loader).template instantiateSymbol<T>(SOCreatorTrait<T>::name))) {}
+
+ /**
* @brief The copy-like constructor, can create So Pointer that dereferenced into child type if T is derived of U
* @param that copied SOPointer object
*/
* @brief Gets a smart pointer to the DLL
*/
std::shared_ptr<Loader> _so_loader;
+
/**
* @brief Gets a smart pointer to the custom object
*/
#include <map>
#include <string>
+#include <vector>
#include "details/ie_no_copy.hpp"
+#include "details/ie_so_pointer.hpp"
#include "ie_api.h"
#include "ie_blob.h"
#include "ie_common.h"
* @return IR version number: 1 or 2
*/
virtual int getVersion(ResponseDesc* resp) noexcept = 0;
+
+ virtual void addExtensions(const std::vector<InferenceEngine::IExtensionPtr>& ext) = 0;
+
+ /**
+ * @brief A virtual destructor.
+ */
+ ~ICNNNetReader() override = default;
+};
+
+IE_SUPPRESS_DEPRECATED_START
+
+namespace details {
+
+/**
+ * @brief This class defines the name of the factory for creating an ICNNNetReader object in a DLL
+ */
+template<>
+class SOCreatorTrait<ICNNNetReader> {
+public:
+ /**
+ * @brief A name of the factory for creating an ICNNNetReader object in a DLL
+ */
+ static constexpr auto name = "CreateICNNNetReader";
};
+} // namespace details
+
+/**
+ * @brief A C++ helper to work with objects created by the IR readers plugin.
+ * Implements different interfaces.
+ */
+using CNNNetReaderPtr = InferenceEngine::details::SOPointer<ICNNNetReader, InferenceEngine::details::SharedObjectLoader>;
+
/**
* @brief Creates a CNNNetReader instance
- *
* @return An object that implements the ICNNNetReader interface
*/
-IE_SUPPRESS_DEPRECATED_START
-INFERENCE_ENGINE_API(ICNNNetReader*) CreateCNNNetReader() noexcept;
+INFERENCE_ENGINE_API_CPP(CNNNetReaderPtr) CreateCNNNetReaderPtr() noexcept;
+
IE_SUPPRESS_DEPRECATED_END
+
} // namespace InferenceEngine
using Ptr = std::shared_ptr<ICNNNetwork>;
/**
+ * @brief Returns nGraph function
+ * @return nGraph function
+ */
+ virtual std::shared_ptr<ngraph::Function> getFunction() noexcept = 0;
+
+ /**
* @brief Returns constant nGraph function
* @return constant nGraph function
*/
/**
* @deprecated Migrate to IR v10 and work with ngraph::Function directly. The method will be removed in 2020.3
- * @brief This class represents a standard Scatter layer
+ * @brief This class represents a standard ScatterUpdate layer
*/
-class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ScatterLayer): public CNNLayer {
+class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ScatterUpdateLayer): public CNNLayer {
public:
/**
- * @brief The axis in Dictionary to scatter Indexes from
- */
- int axis = 0;
- /**
- * @brief Creates a new ScatterLayer instance.
+ * @brief Creates a new ScatterUpdateLayer instance.
*/
using CNNLayer::CNNLayer;
- ~ScatterLayer() override;
+ ~ScatterUpdateLayer() override;
};
/**
+ * @deprecated Migrate to IR v10 and work with ngraph::Function directly. The method will be removed in 2020.3
* @brief This class represents an onnx ExperimentalDetectronPriorGridGenerator Layer
*/
class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ExperimentalDetectronPriorGridGeneratorLayer): public CNNLayer {
};
/**
+ * @brief This class represents a standard ExperimentalDetectronTopKROIs layer
+ */
+class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ExperimentalDetectronTopKROIs): public CNNLayer {
+public:
+ /**
+ * @brief The maximum number of output rois
+ */
+ int max_rois = 0;
+ /**
+ * @brief Creates a new ExperimentalDetectronTopKROIs instance.
+ */
+ using CNNLayer::CNNLayer;
+
+ virtual ~ExperimentalDetectronTopKROIs();
+};
+
+/**
* @brief This class represents an onnx ExperimentalDetectronGenerateProposalsSingleImage Layer
*/
class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ExperimentalDetectronGenerateProposalsSingleImageLayer): public CNNLayer {
#include <vector>
#include "ie_api.h"
+#include "ie_blob.h"
namespace ngraph {
};
#ifdef __clang__
+extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<InferenceEngine::Blob::Ptr>);
extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<int>);
extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<bool>);
extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<float>);
*/
DECLARE_CONFIG_KEY(DUMP_EXEC_GRAPH_AS_DOT);
+
+/**
+ * @brief The name for setting to execute in bfloat16 precision whenever it is possible
+ *
+ * This option lets the plugin know to downscale the precision where it sees performance benefits from
+ * bfloat16 execution.
+ * This option does not guarantee accuracy of the network; accuracy in this mode should be
+ * verified separately by the user, and based on the performance and accuracy results it is the
+ * user's decision whether to use this option.
+ */
+DECLARE_CONFIG_KEY(ENFORCE_BF16);
+
} // namespace PluginConfigParams
} // namespace InferenceEngine
UNSPECIFIED = 255, /**< Unspecified value. Used by default */
MIXED = 0, /**< Mixed value. Can be received from network. No applicable for tensors */
FP32 = 10, /**< 32bit floating point value */
- FP16 = 11, /**< 16bit floating point value */
+ FP16 = 11, /**< 16bit floating point value, 5 bit for exponent, 10 bit for mantissa */
+ BF16 = 12, /**< 16bit floating point value, 8 bit for exponent, 7 bit for mantissa */
Q78 = 20, /**< 16bit specific signed fixed point precision */
I16 = 30, /**< 16bit signed integer value */
U8 = 40, /**< 8bit unsigned integer value */
switch (precisionInfo.value) {
CASE(FP32, float);
CASE2(FP16, int16_t, uint16_t);
+ CASE2(BF16, int16_t, uint16_t);
CASE(I16, int16_t);
CASE(I32, int32_t);
CASE(I64, int64_t);
static std::unordered_map<std::string, ePrecision> names = {
#define PRECISION_NAME(s) {#s, s}
PRECISION_NAME(Q78), PRECISION_NAME(U8), PRECISION_NAME(I8), PRECISION_NAME(I16),
- PRECISION_NAME(I32), PRECISION_NAME(I64), PRECISION_NAME(U64), PRECISION_NAME(U16),
+ PRECISION_NAME(I32), PRECISION_NAME(I64), PRECISION_NAME(U64), PRECISION_NAME(U16),
PRECISION_NAME(FP32), PRECISION_NAME(FP16), PRECISION_NAME(MIXED), PRECISION_NAME(BIN),
- PRECISION_NAME(BOOL),
+ PRECISION_NAME(BOOL), PRECISION_NAME(BF16),
#undef PRECISION_NAME
};
auto i = names.find(str);
switch (v) {
CASE(FP32);
CASE(FP16);
+ CASE(BF16);
CASE(I16);
CASE(I32);
CASE(I64);
using value_type = int16_t;
};
template <>
+struct PrecisionTrait<Precision::BF16> {
+ using value_type = int16_t;
+};
+template<>
struct PrecisionTrait<Precision::Q78> {
using value_type = uint16_t;
};
Please note that although the automatic selection usually provides a reasonable performance,
it still may be non-optimal for some cases, especially for very small networks.
-nthreads "<integer>" Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases).
- -pin "YES"/"NUMA"/"NO" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO")
- CPU threads pinning for CPU-involved inference.
+ -enforcebf16 Optional. Enforcing of floating point operations execution in bfloat16 precision where it is acceptable.
+ -pin "YES"/"NO"/"NUMA" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU threads pinning for CPU-involved inference.
+
Statistics dumping options:
-report_type "<type>" Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request.
-report_folder Optional. Path to a folder where statistics report is stored.
-exec_graph_path Optional. Path to a file where to store executable graph information serialized.
-pc Optional. Report performance counters.
+ -dump_config Optional. Path to XML/YAML/JSON file to dump IE parameters, which were set by application.
+ -load_config Optional. Path to XML/YAML/JSON file to load custom IE parameters. Please note, command line parameters have higher priority than parameters from configuration file.
```
Running the application with the empty list of options yields the usage message given above and an error message.
"usually provides a reasonable performance, it still may be non - optimal for some cases, especially for "
"very small networks. See sample's README for more details.";
+/// @brief message for enforcing of BF16 execution where it is possible
+static const char enforce_bf16_message[] = "Optional. Enforcing of floating point operations execution in bfloat16 precision where it is acceptable.";
+
/// @brief message for user library argument
static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.";
// @brief message for performance counters option
static const char pc_message[] = "Optional. Report performance counters.";
+#ifdef USE_OPENCV
+// @brief message for load config option
+static const char load_config_message[] = "Optional. Path to XML/YAML/JSON file to load custom IE parameters."
+ " Please note, command line parameters have higher priority then parameters from configuration file.";
+
+// @brief message for dump config option
+static const char dump_config_message[] = "Optional. Path to XML/YAML/JSON file to dump IE parameters, which were set by application.";
+#endif
+
/// @brief Define flag for showing help message <br>
DEFINE_bool(h, false, help_message);
/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
DEFINE_string(nstreams, "", infer_num_streams_message);
+/// @brief Enforces bf16 execution with bfloat16 precision on systems having this capability
+DEFINE_bool(enforcebf16, false, enforce_bf16_message);
+
/// @brief Define parameter for batch size <br>
/// Default is 0 (that means don't specify)
DEFINE_uint32(b, 0, batch_size_message);
/// @brief Define flag for showing performance counters <br>
DEFINE_bool(pc, false, pc_message);
+#ifdef USE_OPENCV
+/// @brief Define flag for loading configuration file <br>
+DEFINE_string(load_config, "", load_config_message);
+
+/// @brief Define flag for dumping configuration file <br>
+DEFINE_string(dump_config, "", dump_config_message);
+#endif
+
/**
* @brief This function show a help message
*/
std::cout << std::endl << " device-specific performance options:" << std::endl;
std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
- std::cout << " -pin \"YES\"/\"NO\" " << infer_threads_pinning_message << std::endl;
+ std::cout << " -enforcebf16 " << enforce_bf16_message << std::endl;
+ std::cout << " -pin \"YES\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl;
std::cout << std::endl << " Statistics dumping options:" << std::endl;
std::cout << " -report_type \"<type>\" " << report_type_message << std::endl;
std::cout << " -report_folder " << report_folder_message << std::endl;
std::cout << " -exec_graph_path " << exec_graph_path_message << std::endl;
std::cout << " -pc " << pc_message << std::endl;
+#ifdef USE_OPENCV
+ std::cout << " -dump_config " << dump_config_message << std::endl;
+ std::cout << " -load_config " << load_config_message << std::endl;
+#endif
}
}
if (!FLAGS_report_type.empty() &&
- FLAGS_report_type != noCntReport && FLAGS_report_type != averageCntReport && FLAGS_report_type != detailedCntReport) {
+ FLAGS_report_type != noCntReport && FLAGS_report_type != averageCntReport && FLAGS_report_type != detailedCntReport) {
std::string err = "only " + std::string(noCntReport) + "/" + std::string(averageCntReport) + "/" + std::string(detailedCntReport) +
- " report types are supported (invalid -report_type option value)";
+ " report types are supported (invalid -report_type option value)";
throw std::logic_error(err);
}
static void next_step(const std::string additional_info = "") {
static size_t step_id = 0;
static const std::map<size_t, std::string> step_names = {
- { 1, "Parsing and validating input arguments" },
- { 2, "Loading Inference Engine" },
- { 3, "Setting device configuration" },
- { 4, "Reading the Intermediate Representation network" },
- { 5, "Resizing network to match image sizes and given batch" },
- { 6, "Configuring input of the model" },
- { 7, "Loading the model to the device" },
- { 8, "Setting optimal runtime parameters" },
- { 9, "Creating infer requests and filling input blobs with images" },
- { 10, "Measuring performance" },
- { 11, "Dumping statistics report" }
+ { 1, "Parsing and validating input arguments" },
+ { 2, "Loading Inference Engine" },
+ { 3, "Setting device configuration" },
+ { 4, "Reading the Intermediate Representation network" },
+ { 5, "Resizing network to match image sizes and given batch" },
+ { 6, "Configuring input of the model" },
+ { 7, "Loading the model to the device" },
+ { 8, "Setting optimal runtime parameters" },
+ { 9, "Creating infer requests and filling input blobs with images" },
+ { 10, "Measuring performance" },
+ { 11, "Dumping statistics report" }
};
step_id++;
slog::info << "Network is compiled" << slog::endl;
}
- if (!FLAGS_report_type.empty()) {
- std::vector<gflags::CommandLineFlagInfo> flags;
- StatisticsReport::Parameters command_line_arguments;
- gflags::GetAllFlags(&flags);
-
- for (auto &flag : flags) {
- if (!flag.is_default) {
- command_line_arguments.push_back({ flag.name, flag.current_value });
- }
+ std::vector<gflags::CommandLineFlagInfo> flags;
+ StatisticsReport::Parameters command_line_arguments;
+ gflags::GetAllFlags(&flags);
+ for (auto &flag : flags) {
+ if (!flag.is_default) {
+ command_line_arguments.push_back({ flag.name, flag.current_value });
}
+ }
+ if (!FLAGS_report_type.empty()) {
statistics = std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_report_type, FLAGS_report_folder});
statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
}
+ auto isFlagSetInCommandLine = [&command_line_arguments] (const std::string& name) {
+ return (std::find_if(command_line_arguments.begin(), command_line_arguments.end(),
+ [ name ] (const std::pair<std::string, std::string>& p) { return p.first == name;}) != command_line_arguments.end());
+ };
+
+ std::string device_name = FLAGS_d;
+
+ // Parse devices
+ auto devices = parseDevices(device_name);
+ // Parse nstreams per device
+ std::map<std::string, std::string> device_nstreams = parseNStreamsValuePerDevice(devices, FLAGS_nstreams);
+
+ // Load device config file if specified
+ std::map<std::string, std::map<std::string, std::string>> config;
+#ifdef USE_OPENCV
+ if (!FLAGS_load_config.empty()) {
+ load_config(FLAGS_load_config, config);
+ }
+#endif
/** This vector stores paths to the processed images **/
std::vector<std::string> inputFiles;
parseInputFilesArguments(inputFiles);
- if (FLAGS_nstreams.empty()) {
- slog::warn << "-nstreams default value is determined automatically for a device. "
- "Although the automatic selection usually provides a reasonable performance,"
- "but it still may be non-optimal for some cases, for more information look at README." << slog::endl<< slog::endl;
- }
-
// ----------------- 2. Loading the Inference Engine -----------------------------------------------------------
next_step();
- // Get optimal runtime parameters for device
- std::string device_name = FLAGS_d;
-
Core ie;
-
if (FLAGS_d.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
// CPU (MKLDNN) extensions is loaded as a shared library and passed as a pointer to base extension
const auto extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
slog::info << "CPU (MKLDNN) extensions is loaded " << FLAGS_l << slog::endl;
}
+ // Load clDNN Extensions
if ((FLAGS_d.find("GPU") != std::string::npos) && !FLAGS_c.empty()) {
- // Load clDNN Extensions
- ie.SetConfig({ {CONFIG_KEY(CONFIG_FILE), FLAGS_c} });
- slog::info << "GPU extensions is loaded " << FLAGS_c << slog::endl;
+ // Override config if command line parameter is specified
+ if (!config.count("GPU"))
+ config["GPU"] = {};
+ config["GPU"][CONFIG_KEY(CONFIG_FILE)] = FLAGS_c;
+ }
+ if (config.count("GPU") && config.at("GPU").count(CONFIG_KEY(CONFIG_FILE))) {
+ auto ext = config.at("GPU").at(CONFIG_KEY(CONFIG_FILE));
+ ie.SetConfig({{ CONFIG_KEY(CONFIG_FILE), ext }}, "GPU");
+ slog::info << "GPU extensions is loaded " << ext << slog::endl;
}
slog::info << "InferenceEngine: " << GetInferenceEngineVersion() << slog::endl;
// ----------------- 3. Setting device configuration -----------------------------------------------------------
next_step();
- bool perf_counts = (FLAGS_report_type == detailedCntReport ||
- FLAGS_report_type == averageCntReport ||
- FLAGS_pc ||
- !FLAGS_exec_graph_path.empty());
-
- auto devices = parseDevices(device_name);
- std::map<std::string, uint32_t> device_nstreams = parseNStreamsValuePerDevice(devices, FLAGS_nstreams);
- for (auto& pair : device_nstreams) {
- auto key = std::string(pair.first + "_THROUGHPUT_STREAMS");
- std::vector<std::string> supported_config_keys = ie.GetMetric(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
- if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
- throw std::logic_error("Device " + pair.first + " doesn't support config key '" + key + "'! " +
- "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>");
+ bool perf_counts = false;
+ // Update config per device according to command line parameters
+ for (auto& device : devices) {
+ if (!config.count(device)) config[device] = {};
+ std::map<std::string, std::string>& device_config = config.at(device);
+
+ // Set performance counter
+ if (isFlagSetInCommandLine("pc")) {
+ // set to user defined value
+ device_config[CONFIG_KEY(PERF_COUNT)] = FLAGS_pc ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO);
+ } else if (device_config.count(CONFIG_KEY(PERF_COUNT)) &&
+ (device_config.at(CONFIG_KEY(PERF_COUNT)) == "YES")) {
+ slog::warn << "Performance counters for " << device <<
+ " device is turned on. To print results use -pc option." << slog::endl;
+ } else if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == averageCntReport) {
+ slog::warn << "Turn on performance counters for " << device <<
+ " device since report type is " << FLAGS_report_type << "." << slog::endl;
+ device_config[CONFIG_KEY(PERF_COUNT)] = CONFIG_VALUE(YES);
+ } else if (!FLAGS_exec_graph_path.empty()) {
+ slog::warn << "Turn on performance counters for " << device <<
+ " device due to execution graph dumping." << slog::endl;
+ device_config[CONFIG_KEY(PERF_COUNT)] = CONFIG_VALUE(YES);
+ } else {
+ // set to default value
+ device_config[CONFIG_KEY(PERF_COUNT)] = FLAGS_pc ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO);
}
- }
+ perf_counts = (device_config.at(CONFIG_KEY(PERF_COUNT)) == CONFIG_VALUE(YES)) ? true : perf_counts;
+
+ auto setThroughputStreams = [&] () {
+ const std::string key = device + "_THROUGHPUT_STREAMS";
+ if (device_nstreams.count(device)) {
+ // set to user defined value
+ std::vector<std::string> supported_config_keys = ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
+ if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
+ throw std::logic_error("Device " + device + " doesn't support config key '" + key + "'! " +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>" +
+ " or via configuration file.");
+ }
+ device_config[key] = device_nstreams.at(device);
+ } else if (!device_config.count(key) && (FLAGS_api == "async")) {
+ slog::warn << "-nstreams default value is determined automatically for " << device << " device. "
+ "Although the automatic selection usually provides a reasonable performance,"
+ "but it still may be non-optimal for some cases, for more information look at README." << slog::endl;
+ device_config[key] = std::string(device + "_THROUGHPUT_AUTO");
+ }
+ if (device_config.count(key))
+ device_nstreams[device] = device_config.at(key);
+ };
- for (auto& device : devices) {
if (device == "CPU") { // CPU supports few special performance-oriented keys
// limit threading for CPU portion of inference
- if (FLAGS_nthreads != 0)
- ie.SetConfig({{ CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads) }}, device);
-
- if ((device_name.find("MULTI") != std::string::npos) &&
- (device_name.find("GPU") != std::string::npos)) {
- ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), CONFIG_VALUE(NO) }}, device);
- } else {
- // pin threads for CPU portion of inference
- ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin }}, device);
+ if (isFlagSetInCommandLine("nthreads"))
+ device_config[CONFIG_KEY(CPU_THREADS_NUM)] = std::to_string(FLAGS_nthreads);
+
+ if (isFlagSetInCommandLine("enforcebf16"))
+ device_config[CONFIG_KEY(ENFORCE_BF16)] = FLAGS_enforcebf16 ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO);
+
+ if (isFlagSetInCommandLine("pin")) {
+ // set to user defined value
+ device_config[CONFIG_KEY(CPU_BIND_THREAD)] = FLAGS_pin;
+ } else if (!device_config.count(CONFIG_KEY(CPU_BIND_THREAD))) {
+ if ((device_name.find("MULTI") != std::string::npos) &&
+ (device_name.find("GPU") != std::string::npos)) {
+ slog::warn << "Turn off threads pinning for " << device <<
+ " device since multi-scenario with GPU device is used." << slog::endl;
+ device_config[CONFIG_KEY(CPU_BIND_THREAD)] = CONFIG_VALUE(NO);
+ } else {
+ // set to default value
+ device_config[CONFIG_KEY(CPU_BIND_THREAD)] = FLAGS_pin;
+ }
}
// for CPU execution, more throughput-oriented execution via streams
- if (FLAGS_api == "async")
- ie.SetConfig({{ CONFIG_KEY(CPU_THROUGHPUT_STREAMS),
- (device_nstreams.count(device) > 0 ? std::to_string(device_nstreams.at(device)) :
- "CPU_THROUGHPUT_AUTO") }}, device);
- device_nstreams[device] = std::stoi(ie.GetConfig(device, CONFIG_KEY(CPU_THROUGHPUT_STREAMS)).as<std::string>());
+ setThroughputStreams();
} else if (device == ("GPU")) {
- if (FLAGS_api == "async")
- ie.SetConfig({{ CONFIG_KEY(GPU_THROUGHPUT_STREAMS),
- (device_nstreams.count(device) > 0 ? std::to_string(device_nstreams.at(device)) :
- "GPU_THROUGHPUT_AUTO") }}, device);
- device_nstreams[device] = std::stoi(ie.GetConfig(device, CONFIG_KEY(GPU_THROUGHPUT_STREAMS)).as<std::string>());
+ // for GPU execution, more throughput-oriented execution via streams
+ setThroughputStreams();
if ((device_name.find("MULTI") != std::string::npos) &&
(device_name.find("CPU") != std::string::npos)) {
- // multi-device execution with the CPU + GPU performs best with GPU trottling hint,
- // which releases another CPU thread (that is otherwise used by the GPU driver for active polling)
- ie.SetConfig({{ CLDNN_CONFIG_KEY(PLUGIN_THROTTLE), "1" }}, "GPU");
+ slog::warn << "Turn on GPU trottling. Multi-device execution with the CPU + GPU performs best with GPU trottling hint," <<
+ "which releases another CPU thread (that is otherwise used by the GPU driver for active polling)"<< slog::endl;
+ device_config[CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
}
} else if (device == "MYRIAD") {
- ie.SetConfig({{ CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_WARNING) }}, device);
+ device_config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);
}
}
+ for (auto&& item : config) {
+ ie.SetConfig(item.second, item.first);
+ }
+
auto double_to_string = [] (const double number) {
- std::stringstream ss;
- ss << std::fixed << std::setprecision(2) << number;
- return ss.str();
- };
+ std::stringstream ss;
+ ss << std::fixed << std::setprecision(2) << number;
+ return ss.str();
+ };
auto get_total_ms_time = [] (Time::time_point& startTime) {
return std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
};
-
size_t batchSize = FLAGS_b;
Precision precision = Precision::UNSPECIFIED;
std::string topology_name = "";
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"read network time (ms)", duration_ms}
+ {"read network time (ms)", duration_ms}
});
const InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
}
// ----------------- 7. Loading the model to the device --------------------------------------------------------
next_step();
-
- std::map<std::string, std::string> config = {{ CONFIG_KEY(PERF_COUNT), perf_counts ? CONFIG_VALUE(YES) :
- CONFIG_VALUE(NO) }};
startTime = Time::now();
- exeNetwork = ie.LoadNetwork(cnnNetwork, device_name, config);
+ exeNetwork = ie.LoadNetwork(cnnNetwork, device_name);
duration_ms = double_to_string(get_total_ms_time(startTime));
slog::info << "Load network took " << duration_ms << " ms" << slog::endl;
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"load network time (ms)", duration_ms}
+ {"load network time (ms)", duration_ms}
});
} else {
next_step();
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"import network time (ms)", duration_ms}
+ {"import network time (ms)", duration_ms}
});
if (batchSize == 0) {
batchSize = 1;
// ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
next_step();
+ // Update number of streams
+ for (auto&& ds : device_nstreams) {
+ const std::string key = ds.first + "_THROUGHPUT_STREAMS";
+ device_nstreams[ds.first] = ie.GetConfig(ds.first, key).as<std::string>();
+ }
+
// Number of requests
uint32_t nireq = FLAGS_nireq;
if (nireq == 0) {
if (statistics) {
statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
{
- {"topology", topology_name},
- {"target device", device_name},
- {"API", FLAGS_api},
- {"precision", std::string(precision.name())},
- {"batch size", std::to_string(batchSize)},
- {"number of iterations", std::to_string(niter)},
- {"number of parallel infer requests", std::to_string(nireq)},
- {"duration (ms)", std::to_string(getDurationInMilliseconds(duration_seconds))},
+ {"topology", topology_name},
+ {"target device", device_name},
+ {"API", FLAGS_api},
+ {"precision", std::string(precision.name())},
+ {"batch size", std::to_string(batchSize)},
+ {"number of iterations", std::to_string(niter)},
+ {"number of parallel infer requests", std::to_string(nireq)},
+ {"duration (ms)", std::to_string(getDurationInMilliseconds(duration_seconds))},
});
for (auto& nstreams : device_nstreams) {
std::stringstream ss;
ss << "number of " << nstreams.first << " streams";
statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
{
- {ss.str(), std::to_string(nstreams.second)},
+ {ss.str(), nstreams.second},
});
}
}
double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency :
- batchSize * 1000.0 * iteration / totalDuration;
+ batchSize * 1000.0 * iteration / totalDuration;
if (statistics) {
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"total execution time (ms)", double_to_string(totalDuration)},
- {"total number of iterations", std::to_string(iteration)},
+ {"total execution time (ms)", double_to_string(totalDuration)},
+ {"total number of iterations", std::to_string(iteration)},
});
if (device_name.find("MULTI") == std::string::npos) {
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"latency (ms)", double_to_string(latency)},
+ {"latency (ms)", double_to_string(latency)},
});
}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"throughput", double_to_string(fps)}
+ {"throughput", double_to_string(fps)}
});
}
// ----------------- 11. Dumping statistics report -------------------------------------------------------------
next_step();
+#ifdef USE_OPENCV
+ if (!FLAGS_dump_config.empty()) {
+ dump_config(FLAGS_dump_config, config);
+ slog::info << "Inference Engine configuration settings were dumped to " << FLAGS_dump_config << slog::endl;
+ }
+#endif
+
if (!FLAGS_exec_graph_path.empty()) {
try {
CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo();
if (statistics) {
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
- {"error", ex.what()},
+ {"error", ex.what()},
});
statistics->dump();
}
#include "utils.hpp"
+#ifdef USE_OPENCV
+#include <opencv2/core.hpp>
+#endif
+
uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device) {
static const std::map<std::string, uint32_t> deviceDefaultDurationInSeconds {
{ "CPU", 60 },
if (comma_separated_devices.find(":") != std::string::npos) {
comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1);
}
+ if ((comma_separated_devices == "MULTI") || (comma_separated_devices == "HETERO"))
+ return std::vector<std::string>();
auto devices = split(comma_separated_devices, ',');
for (auto& device : devices)
device = device.substr(0, device.find_first_of(".("));
return devices;
}
-std::map<std::string, uint32_t> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
- const std::string& values_string) {
+std::map<std::string, std::string> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+ const std::string& values_string) {
// Format: <device1>:<value1>,<device2>:<value2> or just <value>
- auto values_string_upper = values_string;
- std::map<std::string, uint32_t> result;
- auto device_value_strings = split(values_string_upper, ',');
+ std::map<std::string, std::string> result;
+ auto device_value_strings = split(values_string, ',');
for (auto& device_value_string : device_value_strings) {
- auto device_value_vec = split(device_value_string, ':');
+ auto device_value_vec = split(device_value_string, ':');
if (device_value_vec.size() == 2) {
auto device_name = device_value_vec.at(0);
auto nstreams = device_value_vec.at(1);
auto it = std::find(devices.begin(), devices.end(), device_name);
if (it != devices.end()) {
- result[device_name] = std::stoi(nstreams);
+ result[device_name] = nstreams;
} else {
throw std::logic_error("Can't set nstreams value " + std::string(nstreams) +
" for device '" + device_name + "'! Incorrect device name!");
}
} else if (device_value_vec.size() == 1) {
- uint32_t value = std::stoi(device_value_vec.at(0));
+ auto value = device_value_vec.at(0);
for (auto& device : devices) {
result[device] = value;
}
}
return result;
}
+
+#ifdef USE_OPENCV
+void dump_config(const std::string& filename,
+ const std::map<std::string, std::map<std::string, std::string>>& config) {
+ cv::FileStorage fs(filename, cv::FileStorage::WRITE);
+ if (!fs.isOpened())
+ throw std::runtime_error("Error: Can't open config file : " + filename);
+ for (auto device_it = config.begin(); device_it != config.end(); ++device_it) {
+ fs << device_it->first << "{:";
+ for (auto param_it = device_it->second.begin(); param_it != device_it->second.end(); ++param_it)
+ fs << param_it->first << param_it->second;
+ fs << "}";
+ }
+ fs.release();
+}
+
+void load_config(const std::string& filename,
+ std::map<std::string, std::map<std::string, std::string>>& config) {
+ cv::FileStorage fs(filename, cv::FileStorage::READ);
+ if (!fs.isOpened())
+ throw std::runtime_error("Error: Can't load config file : " + filename);
+ cv::FileNode root = fs.root();
+ for (auto it = root.begin(); it != root.end(); ++it) {
+ auto device = *it;
+ if (!device.isMap()) {
+ throw std::runtime_error("Error: Can't parse config file : " + filename);
+ }
+ for (auto iit = device.begin(); iit != device.end(); ++iit) {
+ auto item = *iit;
+ config[device.name()][item.name()] = item.string();
+ }
+ }
+}
+#endif
\ No newline at end of file
std::vector<std::string> parseDevices(const std::string& device_string);
uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device);
-std::map<std::string, uint32_t> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
- const std::string& values_string);
+std::map<std::string, std::string> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+ const std::string& values_string);
+#ifdef USE_OPENCV
+void dump_config(const std::string& filename,
+ const std::map<std::string, std::map<std::string, std::string>>& config);
+void load_config(const std::string& filename,
+ std::map<std::string, std::map<std::string, std::string>>& config);
+#endif
\ No newline at end of file
std::vector<std::string> availableDevices = ie.GetAvailableDevices();
// --------------------------- 3. Query and print supported metrics and config keys--------------------
- std::set<std::string> printedDevices;
std::cout << "Available devices: " << std::endl;
for (auto && device : availableDevices) {
- std::string deviceFamilyName = device.substr(0, device.find_first_of('.'));
- if (printedDevices.find(deviceFamilyName) == printedDevices.end())
- printedDevices.insert(deviceFamilyName);
- else
- continue;
-
- std::cout << "\tDevice: " << deviceFamilyName << std::endl;
+ std::cout << "\tDevice: " << device << std::endl;
std::cout << "\tMetrics: " << std::endl;
- std::vector<std::string> supportedMetrics = ie.GetMetric(deviceFamilyName, METRIC_KEY(SUPPORTED_METRICS));
+ std::vector<std::string> supportedMetrics = ie.GetMetric(device, METRIC_KEY(SUPPORTED_METRICS));
for (auto && metricName : supportedMetrics) {
- std::cout << "\t\t" << metricName << " : " << std::flush;
- printParameterValue(ie.GetMetric(device, metricName));
+ if (metricName != METRIC_KEY(AVAILABLE_DEVICES)) {
+ std::cout << "\t\t" << metricName << " : " << std::flush;
+ printParameterValue(ie.GetMetric(device, metricName));
+ }
}
- std::cout << "\tDefault values for device configuration keys: " << std::endl;
- std::vector<std::string> supportedConfigKeys = ie.GetMetric(deviceFamilyName, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
- for (auto && configKey : supportedConfigKeys) {
- std::cout << "\t\t" << configKey << " : " << std::flush;
- printParameterValue(ie.GetConfig(deviceFamilyName, configKey));
+ if (std::find(supportedMetrics.begin(), supportedMetrics.end(),
+ METRIC_KEY(SUPPORTED_CONFIG_KEYS)) != supportedMetrics.end()) {
+ std::cout << "\tDefault values for device configuration keys: " << std::endl;
+ std::vector<std::string> supportedConfigKeys = ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
+ for (auto && configKey : supportedConfigKeys) {
+ std::cout << "\t\t" << configKey << " : " << std::flush;
+ printParameterValue(ie.GetConfig(device, configKey));
+ }
}
std::cout << std::endl;
auto t0 = Time::now();
ExecutableNetwork executableNet;
+ ie.SetConfig(genericPluginConfig, deviceStr);
if (!FLAGS_m.empty()) {
slog::info << "Loading model to the device" << slog::endl;
- executableNet = ie.LoadNetwork(network, deviceStr, genericPluginConfig);
+ executableNet = ie.LoadNetwork(network, deviceStr);
} else {
slog::info << "Importing model to the device" << slog::endl;
- executableNet = ie.ImportNetwork(FLAGS_rg.c_str(), deviceStr, genericPluginConfig);
+ executableNet = ie.ImportNetwork(FLAGS_rg.c_str(), deviceStr);
}
ms loadTime = std::chrono::duration_cast<ms>(Time::now() - t0);
--- /dev/null
+#!/bin/bash
+
+CURRENT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+command -v realpath >/dev/null 2>&1 || { echo >&2 "cpplint require realpath executable but it's not installed. Aborting."; exit 1; }
+SOURCE_DIR=$(realpath ${CURRENT_DIR}/..)
+REPORT_DIR="${SOURCE_DIR}/report"
+CPPLINT_REPORT_DIR="${REPORT_DIR}/cpplint"
+PROJECT_NAME="Inference Engine"
+
+function run_cpplint() {
+ echo "-> CppLint started..."
+ if [ -d ${CPPLINT_REPORT_DIR} ]; then
+ rm -Rf ${CPPLINT_REPORT_DIR}
+ fi
+
+ mkdir -p ${CPPLINT_REPORT_DIR}
+ python ${CURRENT_DIR}/cpplint.py --linelength=160 --counting=detailed --quiet --filter="
+ -build/header_guard,
+ -build/include,
+ -build/include_order,
+ -build/include_subdir,
+ -build/include_what_you_use,
+ -build/namespaces,
+ -build/c++11,
+ -whitespace/indent,
+ -whitespace/comments,
+ -whitespace/ending_newline,
+ -runtime/references,
+ -runtime/int,
+ -runtime/explicit,
+ -readability/todo,
+ -readability/fn_size
+ " $(find ${SOURCE_DIR} -name '*.h' -or -name '*.cc' -or -name '*.c' -or -name '*.cpp' -or -name '*.hpp' |
+ grep -v 'inference-engine/bin\|inference-engine/build\|inference-engine/report\|inference-engine/scripts\|inference-engine/temp\|inference-engine/tests_deprecated/\|gtest\|inference-engine/ie_bridges\|pugixml\|inference-engine/tools/vpu_perfcheck\|thirdparty/gflags\|thirdparty/ade\|thirdparty/fluid\|thirdparty/mkl-dnn\|thirdparty/movidius\|thirdparty/ocv\|thirdparty/plugixml\|thirdparty/std_lib\|thirdparty/clDNN/common\|thirdparty/clDNN/tutorial\|thirdparty/clDNN/utils' |
+ grep 'include\|src\|inference-engine/samples\|thirdparty/clDNN/kernel_selector\|thirdparty/clDNN/api\|thirdparty/clDNN/api_extension\|inference-engine/tests_' ) 2>&1 |
+ sed 's/"/\"/g' >&1| sed 's/</\</g' >&1| sed 's/>/\>/g' >&1| sed "s/'/\'/g" >&1|
+ sed 's/\&/\&/g' >&1| python ${CURRENT_DIR}/cpplint_to_cppcheckxml.py &> ${CPPLINT_REPORT_DIR}/cpplint-cppcheck-result.xml
+
+ # Generate html from it
+ ${CURRENT_DIR}/cppcheck-htmlreport.py --file=${CPPLINT_REPORT_DIR}/cpplint-cppcheck-result.xml --report-dir=${CPPLINT_REPORT_DIR} --source-dir=${SOURCE_DIR} --title=${PROJECT_NAME}
+
+ # Change Cppcheck things to cpplint
+ sed -i.bak 's/Cppcheck/cpplint/g' ${CPPLINT_REPORT_DIR}/index.html
+ sed -i.bak 's/a\ tool\ for\ static\ C\/C++\ code\ analysis/an\ open\ source\ lint\-like\ tool\ from\ Google/g' ${CPPLINT_REPORT_DIR}/index.html
+ sed -i.bak 's/http:\/\/cppcheck.sourceforge.net/http:\/\/google\-styleguide.googlecode.com\/svn\/trunk\/cpplint\/cpplint.py/g' ${CPPLINT_REPORT_DIR}/index.html
+ sed -i.bak 's/IRC: <a href=\"irc:\/\/irc.freenode.net\/cppcheck\">irc:\/\/irc.freenode.net\/cppcheck<\/a>/\ /g' ${CPPLINT_REPORT_DIR}/index.html
+
+ echo "-> CppLint finished..."
+}
+
+function run_cpp_check() {
+ echo "-> Cppcheck started..."
+ CPPCHECK_REPORT_DIR="${REPORT_DIR}/cppcheck"
+ if [ -d ${CPPCHECK_REPORT_DIR} ]; then
+ rm -Rf ${CPPCHECK_REPORT_DIR}
+ fi
+
+ mkdir -p ${CPPCHECK_REPORT_DIR}
+
+ # Generate cppcheck xml
+ cppcheck -v --enable=all --suppress=missingIncludeSystem --std=c++11 ${SOURCE_DIR} -i${SOURCE_DIR}/thirdparty -i${SOURCE_DIR}/tests/libs -i${SOURCE_DIR}/temp -i${SOURCE_DIR}/build \
+ -i${SOURCE_DIR}/bin -i${SOURCE_DIR}/report -I${SOURCE_DIR}/include -I${SOURCE_DIR}/src -I${SOURCE_DIR}/thirdparty/pugixml/src -I${SOURCE_DIR}/thirdparty/gflags/src -I${SOURCE_DIR}/samples/scoring_agent/HTTPClient -I${SOURCE_DIR}/src/inference_engine --xml-version=2 2> ${CPPCHECK_REPORT_DIR}/cppcheck-only-result.xml
+
+ # Generate html from it
+ python ${CURRENT_DIR}/cppcheck-htmlreport.py\
+ --file=${CPPCHECK_REPORT_DIR}/cppcheck-only-result.xml\
+ --report-dir=${CPPCHECK_REPORT_DIR}\
+ --source-dir=${SOURCE_DIR}\
+ --title=${PROJECT_NAME}
+ echo "-> Cppcheck finished..."
+}
+
+if [ ! -d ${REPORT_DIR} ]; then
+ mkdir -p ${REPORT_DIR}
+fi
+
+run_cpplint
+
+out_cpp_lint=`cat ${CPPLINT_REPORT_DIR}/cpplint-cppcheck-result.xml`
+if [[ ${out_cpp_lint} == *"error"* ]]; then
+ exit 1
+fi
+#run_cpp_check
add_subdirectory(preprocessing)
+add_subdirectory(ir_readers)
+
add_subdirectory(legacy_api)
if(ENABLE_MKL_DNN)
key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
+ key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
}
} // namespace CLDNNPlugin
} else {
size = (cldnn::tensor) cldnn::spatial(TensorValue(poolLayer->_kernel[X_AXIS]), TensorValue(poolLayer->_kernel[Y_AXIS]));
stride = (cldnn::tensor) cldnn::spatial(TensorValue(poolLayer->_stride[X_AXIS]), TensorValue(poolLayer->_stride[Y_AXIS]));
- input_offset = { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]) };
+ input_offset = { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]), 0 };
}
auto dt = DataTypeFromPrecision(poolLayer->outData[0]->getPrecision());
#include <set>
#include <string>
#include <algorithm>
+#include <map>
#if defined __INTEL_COMPILER || defined _MSC_VER
#include <malloc.h>
#include "gna2_model_debug_log.hpp"
#else
#include <gna-api-types-xnn.h>
+#include <map>
#endif
return comp.output_scale_factor;
}
+struct InputEndPoint {
+ int idx = 0;
+ size_t size = 0;
+ size_t num_bytes_per_output = 1;
+ InputEndPoint() = default;
+ InputEndPoint(int nidx, size_t sz, size_t esize) : idx(nidx), size(sz), num_bytes_per_output(esize) {}
+};
void GNAPluginNS::backend::AMIntelDNN::WriteGraphWizModel(const char *filename) {
auto & components = component;
return ptra >= ptrb && ptra < reinterpret_cast<char*>(ptrb) + bsize;
};
+ auto startPtr = [](void* ptr, size_t size) {
+ return reinterpret_cast<int8_t*>(ptr);
+ };
+ auto endPtr = [](void* ptr, size_t size) {
+ return reinterpret_cast<int8_t*>(ptr) + size;
+ };
+ auto sizeofTensor = [](void* ptr, size_t size) {
+ return size;
+ };
+
std::fstream graph(filename, std::ios::out);
graph << "strict digraph {";
std::set<void*> weights;
std::set<void*> biases;
- std::set<void*> outputs;
+ std::map<void*, InputEndPoint> outputs;
std::set<std::string> layersNames;
auto generate_layer_name = [&](int k) {
}
}
if (!inputConnected) {
- // drawing tmp connection
- outputs.insert(components[k].ptr_inputs);
- auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs));
- graph << tidx << " -> " << l
- << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
+ // searching for TMP connection
+ size_t tidx = -1;
+ for (auto && en : outputs) {
+ if (intersected(en.first, en.second.size, INPUTS(k))) {
+ tidx = en.second.idx;
+ auto updated_ptr = std::min(startPtr(en.first, en.second.size), startPtr(INPUTS(k)));
+ auto updated_size = std::max(endPtr(en.first, en.second.size), endPtr(INPUTS(k))) - updated_ptr;
+ outputs.erase(en.first);
+ outputs[updated_ptr] = InputEndPoint(tidx, updated_size, components[k].num_bytes_per_input);
+ break;
+ }
+ }
+
+ if (tidx == -1) {
+ outputs[components[k].ptr_inputs] = InputEndPoint(outputs.size(), sizeofTensor(INPUTS(k)), components[k].num_bytes_per_input);
+ }
+ tidx = outputs[components[k].ptr_inputs].idx;
+ graph << "parameter_" << tidx << " -> " << l
+ << " [fontcolor=darkgreen,color=orange, style=dashed];";
}
}
int tidx = 0;
for (auto tmpOutPtrs : outputs) {
- if (components[k].ptr_outputs == tmpOutPtrs) {
+ if (components[k].ptr_outputs == tmpOutPtrs.first) {
graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
}
tidx++;
}
}
+ // writing inputs info
+ for (auto && en : outputs) {
+ std::string l = "parameter_" + std::to_string(en.second.idx);
+ graph << l << " [shape=box, style=filled, fillcolor=\"#85C1E9\"";
+ graph << ", label=<<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"
+ " <TR><TD colspan=\"2\">" << l << "</TD></TR>\n";
+ graph << " <TR><TD> dims</TD><TD>" << 1 << "x" << en.second.size / en.second.num_bytes_per_output << "</TD></TR>\n";
+ graph << " <TR><TD> obit</TD><TD>" << en.second.num_bytes_per_output << "</TD></TR>\n";
+ graph << " <TR><TD> ptr</TD><TD>" << en.first << "</TD></TR>\n";
+ graph << "</TABLE>>];\n";
+ }
+
graph << "}";
}
std::shared_ptr<GNAPlugin> plg;
public:
- GNAExecutableNetwork(const std::string &aotFileName, const std::map<std::string, std::string> &config) :
- plg(std::make_shared<GNAPlugin>(config)) {
+ GNAExecutableNetwork(const std::string &aotFileName, std::shared_ptr<GNAPlugin> plg)
+ : plg(plg) {
plg->ImportNetwork(aotFileName);
_networkInputs = plg->GetInputs();
_networkOutputs = plg->GetOutputs();
}
- GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config)
- : plg(std::make_shared<GNAPlugin>(config)) {
+ GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, std::shared_ptr<GNAPlugin> plg)
+ : plg(plg) {
InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::I64, InferenceEngine::Precision::I32);
InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::U64, InferenceEngine::Precision::I32);
plg->LoadNetwork(network);
}
+ GNAExecutableNetwork(const std::string &aotFileName, const std::map<std::string, std::string> &config)
+ : GNAExecutableNetwork(aotFileName, std::make_shared<GNAPlugin>(config)) {
+ }
+
+ GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config)
+ : GNAExecutableNetwork(network, std::make_shared<GNAPlugin>(config)) {
+ }
+
InferenceEngine::AsyncInferRequestInternal::Ptr
CreateAsyncInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) override {
void ExportImpl(std::ostream&) override {
THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
}
+
+ void GetConfig(const std::string &name,
+ InferenceEngine::Parameter &result,
+ InferenceEngine::ResponseDesc* /*resp*/) const override {
+ result = plg->GetConfig(name, {});
+ }
+
+ void GetMetric(const std::string& name,
+ InferenceEngine::Parameter& result,
+ InferenceEngine::ResponseDesc* /* resp */) const override {
+ result = plg->GetMetric(name, {});
+ }
};
+
} // namespace GNAPluginNS
#include "layers/gna_concat_layer.hpp"
#include "layers/gna_crop_layer.hpp"
#include "round_float_define.hpp"
+#include "gna_plugin_policy.hpp"
using namespace InferenceEngine;
using namespace std;
this->gnaFlags = std::move(gnaFlagsPtr);
}
+void GNAGraphCompiler::setPolicy(GNAPluginNS::Policy policyToSet) {
+ this->policy = policyToSet;
+}
+
intel_dnn_component_t * GNAGraphCompiler::find_first_unused_input(InferenceEngine::CNNLayerPtr current) {
if (current->insData.empty())
return nullptr;
auto outputs = *layer->outData.begin();
auto inputs = layer->insData.begin()->lock();
- // auto offset = filterLayer->GetParamAsInt("output_offset");
-
uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out;
uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+ auto numRowsPadded = filterLayer->GetParamAsInt("num_rows_padded");
+ // number of rows we handle by inserting a copy layer
+ uint32_t num_rows_copied = 0;
+ // in case left alignment succeeds but the number of elements is not a multiple of 8, we need to insert an align_filter;
+ // we improve on this by inserting a copy layer sized to cover most of the elements - the remainder is at most a 32x31 affine filter
+ if (policy.ConcatAlignmentPolicy == Policy::ConcatAlignment::FAST && 0 == numRowsPadded && ALIGN(num_rows_in, 32) > 32) {
+ // can we use copy at all
+ num_rows_copied = ALIGN(num_rows_in, 32) - 32;
+
+ auto orientation = kDnnInterleavedOrientation;
+
+ auto& copyComponent = dnnComponents.addComponent(layer->name + "_synthetic_copy", "copy");
+
+ dnn->InitCopyComponent(copyComponent,
+ orientation,
+ num_rows_copied,
+ num_columns_in,
+ num_rows_copied,
+ num_columns_in,
+ inputs->getPrecision().size(),
+ inputs->getPrecision().size(),
+ quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+ num_rows_copied,
+ num_columns_in,
+ ptr_inputs,
+ ptr_outputs);
+
+
+ size_t num_data_bytes_in = num_rows_copied * num_rows_copied * num_columns_in
+ * inputs->getPrecision().size();
+ // need to reserve the full tensor, so use the original size, assuming an identity activation is attached to the filter later on
+ size_t num_data_bytes_out = num_rows_out * num_columns_in * inputs->getPrecision().size();
+
+ connectInput(layer, ptr_inputs, num_data_bytes_in);
+ auto isNonFunctional = [](CNNLayerPtr l) {
+ return LayerInfo(l).isNonFunctional();
+ };
+ auto identity = CNNNetGetNextLayerSkipCertain(layer, 0, 0, isNonFunctional);
+ connectOutput(identity.first, ptr_outputs, num_data_bytes_out);
+
+ num_rows_in -= num_rows_copied;
+ num_rows_out -= num_rows_copied;
+ }
+ filterLayer->params["rows_copied_offset"] = std::to_string(num_rows_copied * inputs->getPrecision().size());
+
+
auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->getTensorDesc().getPrecision() : outputs->getPrecision();
auto& currentComponent = dnnComponents.addComponent(layer->name, "affine");
ptr_biases,
false);
- size_t num_data_bytes_out =
- InferenceEngine::details::product(
- begin(outputs->getDims()), end(outputs->getDims())) * 4;
-
+ size_t num_data_bytes_out = num_rows_out * num_columns_in * outputs->getPrecision().size();
size_t num_data_bytes_in = num_columns_in *
ALIGN(num_rows_in, 8) * inputs->getPrecision().size();
- connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
+ connectInput(layer, ptr_inputs, num_data_bytes_in, num_rows_copied * inputs->getPrecision().size(), 0);
connectOutput(layer, ptr_outputs, num_data_bytes_out);
- if (num_padding == 0) {
- gnamem->readonly().push_ptr(ptr_weights,
- filterLayer->_weights->cbuffer().as<const void*>(),
- filterLayer->_weights->byteSize(),
- 64);
- } else {
+ {
+ auto weightsElementSize = filterLayer->_weights->getTensorDesc().getPrecision().size();
auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
auto paddedWeights = elementsIn * num_rows_out;
- auto paddedWeightsSize = paddedWeights * filterLayer->precision.size();
+ auto paddedWeightsSize = paddedWeights * weightsElementSize;
+
+ // TODO: this can be improved to not generate unneeded weights at all
+
+ size_t weights_stride = (num_rows_in + num_rows_copied) * weightsElementSize;
+ size_t weights_offset = weights_stride * num_rows_copied + num_rows_copied * weightsElementSize;
gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) {
- size_t offset = 0;
- for (int i = 0; i < num_rows_out && size >= offset; i++) {
- ie_memcpy(reinterpret_cast<uint8_t*>(data) + offset, size - offset,
- filterLayer->_weights->cbuffer().as<const uint8_t*>() + num_rows_in * i * filterLayer->precision.size(),
- num_rows_in* filterLayer->precision.size());
- offset += (num_rows_in + num_padding) * filterLayer->precision.size();
+ size_t roffset = weights_offset;
+ size_t woffset = 0;
+ for (int i = 0; i < num_rows_out && size >= woffset; i++) {
+ ie_memcpy(reinterpret_cast<uint8_t*>(data) + woffset,
+ size - woffset,
+ filterLayer->_weights->cbuffer().as<const uint8_t*>() + roffset,
+ num_rows_in * weightsElementSize);
+ roffset += weights_stride;
+ woffset += elementsIn * weightsElementSize;
}
- }, 64);
+ }, 64);
}
if (filterLayer->_biases) {
num_rows = FROM_IR_DIM(inputs, 1);
}
- size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims()))
- * outputs->getPrecision().size();
+ // TODO: solve this by layer level transformations
+ auto concatAlignFilter = CNNNetPrevLayer(layer, 0);
+ if (LayerInfo(concatAlignFilter).isConcatAlignFilter()) {
+ auto rowsCopiedOffset = concatAlignFilter->GetParamAsInt("rows_copied_offset");
+ if (rowsCopiedOffset != 0) {
+ num_rows -= rowsCopiedOffset / outputs->getPrecision().size();
+ layer->params["output_offset"] = std::to_string(rowsCopiedOffset);
+ }
+ }
- size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs->getDims()), end(inputs->getDims()))
- * inputs->getPrecision().size();
+ size_t num_data_bytes_out = num_columns * num_rows * outputs->getPrecision().size();
+ size_t num_data_bytes_in = num_columns * num_rows * inputs->getPrecision().size();
static InferenceEngine::details::caseless_unordered_map<std::string, DnnActivationType> supportedActivations = {
{"sigmoid", kActSigmoid},
if (it != splitLayerInfoItem.splitOutputLayers.end()) {
gnalog() << "Connecting " << splitName << " input \n";
- auto res = connectInput(splittingLayer, ptr, splitLayerInfoItem.reserved_size, it->offset, 0);
+ auto res = connectInput(splittingLayer, ptr, splitLayerInfoItem.reserved_size, it->offset + offset, 0);
gnalog() << "Connected \n";
return res;
}
#include "backend/am_intel_dnn.hpp"
#include "gna_device.hpp"
#include "gna_data_types.hpp"
+#include "gna_plugin_policy.hpp"
namespace GNAPluginNS {
class GNAGraphCompiler {
std::shared_ptr<GNAPluginNS::gna_memory_type> gnamem;
std::shared_ptr<GNAPluginNS::InputDesc> inputDesc;
std::shared_ptr<GNAPluginNS::GNAFlags> gnaFlags;
+ Policy policy;
// layers with extra storage for connections and additional
// non trivial processing
void setDNNPtr(std::shared_ptr<GNAPluginNS::backend::AMIntelDNN> dnnPtr);
void setInputDescPtr(std::shared_ptr<GNAPluginNS::InputDesc> inputDescPtr);
void setGNAFlagsPtr(std::shared_ptr<GNAPluginNS::GNAFlags> gnaFlagsPtr);
+ void setPolicy(GNAPluginNS::Policy policy);
void fillMemoryConnections(std::unordered_map<std::string,
std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);
auto convert_to_serial = [getOffsetFromBase](const GNAModelSerial::RuntimeEndPoint& ep) {
ModelHeader::EndPoint out;
out.elements_count = ep.elements_count;
- out.element_size = ep.element_size;
out.descriptor_offset = offsetFromBase(ep.descriptor_ptr);
out.scaleFactor = ep.scaleFactor;
return out;
#include <graph_tools.hpp>
#include <debug.h>
#include <gna/gna_config.hpp>
+#include "gna_plugin_config.hpp"
#include <ie_util_internal.hpp>
#include "gna_plugin.hpp"
#include "optimizer/gna_pass_manager.hpp"
GNAPlugin::GNAPlugin() {
Init();
+ UpdateFieldsFromConfig();
}
GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
void GNAPlugin::InitGNADevice() {
#if GNA_LIB_VER == 1
- gnadevice = std::make_shared<GNADeviceHelper>(gna_proc_type,
+ gnadevice = std::make_shared<GNADeviceHelper>(config.gna_proc_type,
gnaFlags->gna_lib_async_threads_num,
gnaFlags->gna_openmp_multithreading,
gnaFlags->performance_counting);
#else
- gnadevice = std::make_shared<GNADeviceHelper>(pluginGna2AccMode,
- pluginGna2DeviceConsistent,
+ gnadevice = std::make_shared<GNADeviceHelper>(config.pluginGna2AccMode,
+ config.pluginGna2DeviceConsistent,
gnaFlags->gna_lib_async_threads_num,
gnaFlags->gna_openmp_multithreading,
gnaFlags->performance_counting);
run_passes(newNet, true);
run_passes(newNet, false);
} else {
- switch (gnaPrecision) {
+ switch (config.gnaPrecision) {
case Precision::I16:
ModelQuantizer<QuantI16> q16;
newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
auto sortedNet = CNNNetSortTopologicallyEx(*newNet, make_fuzed_order);
+ // passing policy to compiler
+ graphCompiler.setPolicy(policy);
+
if (sortedNet.empty()) {
THROW_GNA_EXCEPTION << "Sorted network is empty";
}
gnalog() << "[UFS] from : "<< outPort.first <<" reached: " << layer->name << "\n";
+ // probing gna_primitives
if (irLayerAvatar != graphCompiler.dnnComponents.components.end()) {
initOutput(portId, irLayerAvatar->second, layer);
stopSearching = true;
}
+
+ // probing concatInfo
+ if (!stopSearching && LayerInfo(layer).isConcat()) {
+ auto concatConnection = graphCompiler.concat_connection.find(layer->name);
+ if (concatConnection != graphCompiler.concat_connection.end()) {
+ //initOutput(portId, irLayerAvatar->second, layer);
+
+ auto &desc = outputsDesc[portId];
+ auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+ desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
+ // TODO: what is orientation for concat
+ desc.orientation = kDnnInterleavedOrientation;
+ desc.num_bytes_per_element = layer->outData.front()->getPrecision().size();
+ desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+ desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element;
+
+ // binding ptr for first infer request - then others will be setup during relocation
+ gnamem->bind_ptr(&desc.ptrs.front(), &concatConnection->second.gna_ptr);
+ stopSearching = true;
+ }
+ }
}, true, [&stopSearching](InferenceEngine::CNNLayer* from) {
return make_upstream_order(!stopSearching ? from : nullptr);
});
void GNAPlugin::DumpXNNToFile() const {
// TODO: output precision as well as pointer might be incorrect, LSTM for sure
// gna looks automatically set layer 0 as output and adjust it's pointer / precision/ size respectively
- if (dumpXNNPath.empty()) {
+ if (config.dumpXNNPath.empty()) {
return;
}
- if (dumpXNNGeneration != "GNA1" &&
- dumpXNNGeneration != "GNA3" &&
- !dumpXNNGeneration.empty()) {
- THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << dumpXNNGeneration;
+ if (config.dumpXNNGeneration != "GNA1" &&
+ config.dumpXNNGeneration != "GNA3" &&
+ !config.dumpXNNGeneration.empty()) {
+ THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << config.dumpXNNGeneration;
}
if (!gnadevice) {
THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network";
}
- std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary);
+ std::ofstream dumpStream(config.dumpXNNPath, std::ios::out | std::ios::binary);
#if GNA_LIB_VER == 1
auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
dump.header.rw_region_size = gnamem->getRWBytes();
dumpStream.write(reinterpret_cast<char*>(dump.model.get()), dump.header.model_size);
#else
auto const modelId = gnadevice->createModel(std::get<0>(gnaModels.front())->obj);
- if (dumpXNNGeneration != "GNA3") {
+ if (config.dumpXNNGeneration != "GNA3") {
auto dump = gnadevice->dumpXnn(modelId);
dump.header.RwRegionSize = gnamem->getRWBytes();
dump.header.InputScalingFactor = inputsDesc->inputScaleFactors.front();
void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}
-void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {
- Init();
- auto supportedConfigOptions = supportedConfigKeys();
-
- for (auto& item : config) {
- auto keys = std::find_if(supportedConfigOptions.begin(), supportedConfigOptions.end(), [&item](const std::string& supportedConfigOption) {
- return item.first == supportedConfigOption ||
- item.first.find(GNA_CONFIG_KEY(SCALE_FACTOR)) == 0;
- });
- if (keys == supportedConfigOptions.end()) {
- THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first << " not supported";
- }
- }
-
- // holds actual value of a found key
- std::string key;
- std::string value;
- auto if_set = [&](const std::string& keyInput, const std::function<void()> & handler) {
- auto keyInMap = config.find(keyInput);
- if (keyInMap != config.end()) {
- value = keyInMap->second;
- handler();
- }
- };
-
- auto if_start = [&](const std::string& keyInput, const std::function<void()> & handler) {
- for (auto && c : config) {
- if (c.first.find(keyInput) == 0) {
- if (c.first.size() > keyInput.size() + 1) {
- key = c.first.substr(keyInput.size() + 1);
- value = c.second;
- handler();
- }
- }
- }
- };
-
- auto fp32eq = [](float p1, float p2) -> bool {
- return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
- };
-
- auto & log = gnalog();
-
- if_start(GNA_CONFIG_KEY(SCALE_FACTOR), [&, this] {
- uint64_t scaleForInput = std::stoul(key, NULL, 10);
- if (scaleForInput > 10) {
- THROW_GNA_EXCEPTION << "input scale factor with index(" << key << ") unsupported";
- }
- auto scaleFactor = std::stod(value);
- if (fp32eq(scaleFactor, 0.0f)) {
- THROW_GNA_EXCEPTION << "input scale factor of 0.0f not supported";
- }
- // not appeared scale factors are to be 1.0f
- if (inputsDesc->inputScaleFactors.size() <= scaleForInput) {
- inputsDesc->inputScaleFactors.resize(scaleForInput + 1, 1.f);
- }
- inputsDesc->inputScaleFactors[scaleForInput] = InferenceEngine::CNNLayer::ie_parse_float(value);
- });
-
- if (inputsDesc->inputScaleFactors.empty()) {
- if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
- auto scaleFactor = InferenceEngine::CNNLayer::ie_parse_float(value);
- if (fp32eq(scaleFactor, 0.0f)) {
- THROW_GNA_EXCEPTION << "input scale factor of 0.0f not supported";
- }
- inputsDesc->inputScaleFactors.push_back(scaleFactor);
- });
- }
-
- if (inputsDesc->inputScaleFactors.empty()) {
- inputsDesc->inputScaleFactors.push_back(1.f);
- }
-
- if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
- dumpXNNPath = value;
- });
-
- if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), [&] {
- dumpXNNGeneration = value;
- });
-
- if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
-#if GNA_LIB_VER == 1
- static caseless_unordered_map <std::string, uint32_t> supported_values = {
- {GNAConfigParams::GNA_AUTO, GNA_AUTO},
- {GNAConfigParams::GNA_HW, GNA_HARDWARE},
- {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
- {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
- };
- static std::vector <std::string> supported_values_on_gna2 = {
- GNAConfigParams::GNA_GEN,
- GNAConfigParams::GNA_GEN_EXACT,
- GNAConfigParams::GNA_SSE,
- GNAConfigParams::GNA_SSE_EXACT,
- GNAConfigParams::GNA_AVX1,
- GNAConfigParams::GNA_AVX1_EXACT,
- GNAConfigParams::GNA_AVX2,
- GNAConfigParams::GNA_AVX2_EXACT
- };
-#else
- static caseless_unordered_map <std::string, std::pair<Gna2AccelerationMode, Gna2DeviceVersion> > supported_values = {
- {GNAConfigParams::GNA_AUTO, {Gna2AccelerationModeAuto, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_HW, {Gna2AccelerationModeHardware, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_SW, {Gna2AccelerationModeSoftware, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_SW_EXACT, {Gna2AccelerationModeSoftware, Gna2DeviceVersion1_0}},
- {GNAConfigParams::GNA_GEN, {Gna2AccelerationModeGeneric, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_GEN_EXACT, {Gna2AccelerationModeGeneric, Gna2DeviceVersion1_0}},
- {GNAConfigParams::GNA_SSE, {Gna2AccelerationModeSse4x2, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_SSE_EXACT, {Gna2AccelerationModeSse4x2, Gna2DeviceVersion1_0}},
- {GNAConfigParams::GNA_AVX1, {Gna2AccelerationModeAvx1, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_AVX1_EXACT, {Gna2AccelerationModeAvx1, Gna2DeviceVersion1_0}},
- {GNAConfigParams::GNA_AVX2, {Gna2AccelerationModeAvx2, Gna2DeviceVersionSoftwareEmulation}},
- {GNAConfigParams::GNA_AVX2_EXACT, {Gna2AccelerationModeAvx2, Gna2DeviceVersion1_0}},
- };
-#endif
- auto procType = supported_values.find(value);
- if (procType == supported_values.end()) {
- if (value == GNA_CONFIG_VALUE(SW_FP32)) {
- gnaFlags->sw_fp32 = true;
- } else {
-#if GNA_LIB_VER == 1
- auto is_gna2_mode = std::find(
- supported_values_on_gna2.begin(),
- supported_values_on_gna2.end(),
- value);
- if (is_gna2_mode != supported_values_on_gna2.end()) {
- THROW_GNA_EXCEPTION << "This GNA device mode require GNA2 library: " << value;
- }
-#endif
- THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
- }
- } else {
-#if GNA_LIB_VER == 1
- gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
-#else
- pluginGna2AccMode = procType->second.first;
- pluginGna2DeviceConsistent = procType->second.second;
-#endif
- }
- });
-
- if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
- if (value == PluginConfigParams::YES) {
- gnaFlags->compact_mode = true;
- } else if (value == PluginConfigParams::NO) {
- gnaFlags->compact_mode = false;
- } else {
- log << "GNA compact mode should be YES/NO, but not" << value;
- THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
- }
- });
-
- if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
- if (value == PluginConfigParams::YES) {
- gnaFlags->exclusive_async_requests = true;
- } else if (value == PluginConfigParams::NO) {
- gnaFlags->exclusive_async_requests = false;
- } else {
- log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
- THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
- }
- });
-
- if_set(GNA_CONFIG_KEY(PRECISION), [&] {
- auto precision = Precision::FromStr(value);
- if (precision != Precision::I8 && precision != Precision::I16) {
- log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
- THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
- }
- gnaPrecision = precision;
- });
-
- if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
- if (value == PluginConfigParams::YES) {
- gnaFlags->uniformPwlDesign = true;
- } else if (value == PluginConfigParams::NO) {
- gnaFlags->uniformPwlDesign = false;
- } else {
- log << "GNA pwl uniform algorithm parameter "
- << "should be equal to YES/NO, but not" << value;
- THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
- << "should be equal to YES/NO, but not" << value;
- }
- });
-
- if_set(CONFIG_KEY(PERF_COUNT), [&] {
- if (value == PluginConfigParams::YES) {
- gnaFlags->performance_counting = true;
- } else if (value == PluginConfigParams::NO) {
- gnaFlags->performance_counting = false;
- } else {
- log << "GNA performance counter enabling parameter "
- << "should be equal to YES/NO, but not" << value;
- THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
- << "should be equal to YES/NO, but not" << value;
- }
- });
-
- if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
- uint64_t lib_threads = std::stoul(value, NULL, 10);
- if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
- log << "Unsupported accelerator lib number of threads: " << value << ", should be greateer than 0 and less than 127";
- THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
- << ", should be greateer than 0 and less than 127";
- }
- gnaFlags->gna_lib_async_threads_num = lib_threads;
- });
-
- if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
- if (value == PluginConfigParams::YES) {
- gnaFlags->gna_openmp_multithreading = false;
- } else if (value == PluginConfigParams::NO) {
- gnaFlags->gna_openmp_multithreading = true;
- } else {
- log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
- THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
- }
- });
+// Validates and stores the given options in the plugin-level Config object,
+// then mirrors the resulting values into the plugin's working fields.
+void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config_map) {
+    config.UpdateFromMap(config_map);
+    UpdateFieldsFromConfig();
+}
-    if (gnaFlags->sw_fp32 && gnaFlags->gna_lib_async_threads_num > 1) {
-        THROW_GNA_EXCEPTION << "GNA plugin not support async mode on GNA_SW_FP32!";
-    }
+// Copies parsed configuration values into the member structures the rest of
+// the plugin reads (per-input scale factors and execution flags).
+void GNAPlugin::UpdateFieldsFromConfig() {
+    inputsDesc->inputScaleFactors = config.inputScaleFactors;
+    *gnaFlags = config.gnaFlags;
+}
void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
#include "gna_graph_compiler.hpp"
#include "gna_plugin_policy.hpp"
#include "gna_plugin_log.hpp"
+#include "gna_plugin_config.hpp"
#if GNA_LIB_VER == 2
#include <gna2-model-api.h>
protected:
std::string _pluginName = "GNA";
+ Config config;
std::shared_ptr<GNAPluginNS::backend::AMIntelDNN> dnn;
std::shared_ptr<GNAPluginNS::GNAFlags> gnaFlags;
std::shared_ptr<GNAPluginNS::gna_memory_type> gnamem;
// index matches iterating order of cnnnetwork outputs info
std::vector<GNAPluginNS::OutputDesc> outputsDesc = std::vector<OutputDesc>();
- // precision of GNA hardware model
- InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
-
intel_dnn_number_type_t output_type = kDnnInt;
GNAPluginNS::Policy policy;
- std::string dumpXNNPath;
- std::string dumpXNNGeneration;
-#if GNA_LIB_VER == 1
- intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
-#else
- Gna2AccelerationMode pluginGna2AccMode = Gna2AccelerationModeSoftware;
-Gna2DeviceVersion pluginGna2DeviceConsistent = Gna2DeviceVersion1_0;
-void createRequestConfigsForGnaModels();
+
+#if GNA_LIB_VER == 2
+ void createRequestConfigsForGnaModels();
#endif
std::shared_ptr<GNADeviceHelper> gnadevice;
void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap);
void AddExtension(InferenceEngine::IExtensionPtr extension) override;
- std::vector<std::string> supportedConfigKeys() const;
- std::map<std::string, std::string> supportedConfigKeysWithDefaults() const;
-
void SetConfig(const std::map<std::string, std::string> &config) override;
void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
const InferenceEngine::ICNNNetwork &network,
- const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+ const std::map<std::string, std::string> &config_map) override { THROW_GNA_EXCEPTION << "Not implemented"; }
InferenceEngine::ExecutableNetwork LoadNetwork(const InferenceEngine::ICNNNetwork &network,
- const std::map<std::string, std::string> &config,
+ const std::map<std::string, std::string> &config_map,
InferenceEngine::RemoteContext::Ptr context) override { THROW_GNA_EXCEPTION << "Not implemented"; }
void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result);
void SetCore(InferenceEngine::ICore*) noexcept override {}
const GNASplitLayer& splitInfo,
size_t precision_size,
int idx = 0);
+
+ void UpdateFieldsFromConfig();
};
+
} // namespace GNAPluginNS
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gna/gna_config.hpp>
+#include "gna_plugin.hpp"
+#include "gna_plugin_config.hpp"
+#include "ie_common.h"
+#include <details/caseless.hpp>
+#include <unordered_map>
+
+using namespace InferenceEngine;
+using namespace InferenceEngine::details;
+
+namespace GNAPluginNS {
+
+#if GNA_LIB_VER == 1
+static caseless_unordered_map<std::string, uint32_t> supported_values = {
+ {GNAConfigParams::GNA_AUTO, GNA_AUTO},
+ {GNAConfigParams::GNA_HW, GNA_HARDWARE},
+ {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
+ {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
+};
+static std::vector<std::string> supported_values_on_gna2 = {
+ GNAConfigParams::GNA_GEN,
+ GNAConfigParams::GNA_GEN_EXACT,
+ GNAConfigParams::GNA_SSE,
+ GNAConfigParams::GNA_SSE_EXACT,
+ GNAConfigParams::GNA_AVX1,
+ GNAConfigParams::GNA_AVX1_EXACT,
+ GNAConfigParams::GNA_AVX2,
+ GNAConfigParams::GNA_AVX2_EXACT
+};
+#else
+static caseless_unordered_map <std::string, std::pair<Gna2AccelerationMode, Gna2DeviceVersion>> supported_values = {
+ {GNAConfigParams::GNA_AUTO, {Gna2AccelerationModeAuto, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_HW, {Gna2AccelerationModeHardware, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_SW, {Gna2AccelerationModeSoftware, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_SW_EXACT, {Gna2AccelerationModeSoftware, Gna2DeviceVersion1_0}},
+ {GNAConfigParams::GNA_GEN, {Gna2AccelerationModeGeneric, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_GEN_EXACT, {Gna2AccelerationModeGeneric, Gna2DeviceVersion1_0}},
+ {GNAConfigParams::GNA_SSE, {Gna2AccelerationModeSse4x2, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_SSE_EXACT, {Gna2AccelerationModeSse4x2, Gna2DeviceVersion1_0}},
+ {GNAConfigParams::GNA_AVX1, {Gna2AccelerationModeAvx1, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_AVX1_EXACT, {Gna2AccelerationModeAvx1, Gna2DeviceVersion1_0}},
+ {GNAConfigParams::GNA_AVX2, {Gna2AccelerationModeAvx2, Gna2DeviceVersionSoftwareEmulation}},
+ {GNAConfigParams::GNA_AVX2_EXACT, {Gna2AccelerationModeAvx2, Gna2DeviceVersion1_0}},
+ };
+#endif
+
+// Parses and validates an IE key/value option map, updating this Config's
+// fields. Throws on any unknown key or malformed value; on success rebuilds
+// the cached string view via AdjustKeyMapValues().
+void Config::UpdateFromMap(const std::map<std::string, std::string>& config) {
+    // Hoisted out of the loop: these do not depend on the current item.
+    // Approximate float equality, used to reject zero scale factors.
+    auto fp32eq = [](float p1, float p2) -> bool {
+        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+    };
+    auto &log = gnalog();
+
+    for (auto&& item : config) {
+        auto key = item.first;
+        auto value = item.second;
+
+        if (key.find(GNA_CONFIG_KEY(SCALE_FACTOR)) == 0) {
+            uint64_t input_index;
+            if (key == GNA_CONFIG_KEY(SCALE_FACTOR)) {
+                // Bare SCALE_FACTOR key addresses the first input.
+                input_index = 0;
+            } else {
+                // Key has the form SCALE_FACTOR_<index>.
+                key.erase(0, strlen(GNA_CONFIG_KEY(SCALE_FACTOR)));
+                if (key[0] != '_') {
+                    THROW_GNA_EXCEPTION << "Invalid format of scale factor configuration key";
+                }
+                key.erase(0, 1);
+                try {
+                    // BUGFIX: parse into a signed int so a negative index is caught
+                    // here instead of wrapping in uint64_t; the original range test
+                    // also used bitwise '|' instead of logical '||' and compared an
+                    // unsigned value with 0 (always false).
+                    int parsed_index = std::stoi(key);
+                    if (parsed_index < 0 || parsed_index > 9) {
+                        throw std::out_of_range("");
+                    }
+                    input_index = static_cast<uint64_t>(parsed_index);
+                } catch (std::invalid_argument&) {
+                    THROW_GNA_EXCEPTION << "Invalid value of index of input scale factor";
+                } catch (std::out_of_range&) {
+                    // BUGFIX: report the offending index (held in 'key' after the
+                    // prefix was erased), not the scale factor value.
+                    THROW_GNA_EXCEPTION << "Index of input scale factor must be in the range [0..9], " << key << " provided";
+                }
+            }
+            auto scale_factor = InferenceEngine::CNNLayer::ie_parse_float(value);
+            if (fp32eq(scale_factor, 0.0f)) {
+                THROW_GNA_EXCEPTION << "input scale factor of 0.0f not supported";
+            }
+            // missing scale factors are set to be 1.0f
+            if (inputScaleFactors.size() <= input_index) {
+                inputScaleFactors.resize(input_index + 1, 1.f);
+            }
+            // Reuse the already-parsed value instead of parsing it a second time.
+            inputScaleFactors[input_index] = scale_factor;
+        } else if (key == GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE)) {
+            dumpXNNPath = value;
+        } else if (key == GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION)) {
+            dumpXNNGeneration = value;
+        } else if (key == GNA_CONFIG_KEY(DEVICE_MODE)) {
+            auto procType = supported_values.find(value);
+            if (procType == supported_values.end()) {
+                if (value == GNA_CONFIG_VALUE(SW_FP32)) {
+                    gnaFlags.sw_fp32 = true;
+                } else {
+#if GNA_LIB_VER == 1
+                    // Dedicated diagnostic for modes that exist only in GNA2.
+                    auto is_gna2_mode = std::find(
+                            supported_values_on_gna2.begin(),
+                            supported_values_on_gna2.end(),
+                            value);
+                    if (is_gna2_mode != supported_values_on_gna2.end()) {
+                        THROW_GNA_EXCEPTION << "This GNA device mode requires GNA2 library: " << value;
+                    }
+#endif
+                    THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
+                }
+            } else {
+#if GNA_LIB_VER == 1
+                gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
+#else
+                pluginGna2AccMode = procType->second.first;
+                pluginGna2DeviceConsistent = procType->second.second;
+#endif
+            }
+        } else if (key == GNA_CONFIG_KEY(COMPACT_MODE)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.compact_mode = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.compact_mode = false;
+            } else {
+                log << "GNA compact mode should be YES/NO, but not " << value;
+                THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not " << value;
+            }
+        } else if (key == CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.exclusive_async_requests = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.exclusive_async_requests = false;
+            } else {
+                log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
+            }
+        } else if (key == GNA_CONFIG_KEY(PRECISION)) {
+            auto precision = Precision::FromStr(value);
+            if (precision != Precision::I8 && precision != Precision::I16) {
+                log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+                THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: "
+                                    << value;
+            }
+            gnaPrecision = precision;
+        } else if (key == GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.uniformPwlDesign = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.uniformPwlDesign = false;
+            } else {
+                log << "GNA pwl uniform algorithm parameter "
+                    << "should be equal to YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
+                                    << "should be equal to YES/NO, but not" << value;
+            }
+        } else if (key == CONFIG_KEY(PERF_COUNT)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.performance_counting = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.performance_counting = false;
+            } else {
+                log << "GNA performance counter enabling parameter "
+                    << "should be equal to YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
+                                    << "should be equal to YES/NO, but not" << value;
+            }
+        } else if (key == GNA_CONFIG_KEY(LIB_N_THREADS)) {
+            uint64_t lib_threads;
+            try {
+                lib_threads = std::stoul(value);
+                // BUGFIX: the diagnostic below promises "less than 127", so 126 is
+                // the maximum; the previous bound ((max()+1)/2 - 1) accepted 127.
+                if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max() / 2 - 1) {
+                    throw std::out_of_range("");
+                }
+            } catch (std::invalid_argument&) {
+                THROW_GNA_EXCEPTION << "Invalid value of number of threads";
+            } catch (std::out_of_range&) {
+                log << "Unsupported accelerator lib number of threads: " << value
+                    << ", should be greater than 0 and less than 127";
+                THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
+                                    << ", should be greater than 0 and less than 127";
+            }
+            gnaFlags.gna_lib_async_threads_num = lib_threads;
+        } else if (key == CONFIG_KEY(SINGLE_THREAD)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.gna_openmp_multithreading = false;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.gna_openmp_multithreading = true;
+            } else {
+                // BUGFIX: the message referred to EXCLUSIVE_ASYNC_REQUESTS (copy-paste).
+                log << "SINGLE_THREAD should be YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "SINGLE_THREAD should be YES/NO, but not" << value;
+            }
+        } else {
+            THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first
+                                << " not supported";
+        }
+    }
+
+    // Cross-option consistency is order-independent, so it is validated once
+    // after all options are applied instead of on every loop iteration.
+    if (gnaFlags.sw_fp32 && gnaFlags.gna_lib_async_threads_num > 1) {
+        THROW_GNA_EXCEPTION << "GNA plugin does not support async mode on GNA_SW_FP32!";
+    }
+
+    if (inputScaleFactors.empty()) {
+        inputScaleFactors.push_back(1.0f);
+    }
+
+    AdjustKeyMapValues();
+}
+
+// Rebuilds the cached string key/value view (key_config_map) so that
+// GetParameter()/GetSupportedKeys() reflect the current field values.
+void Config::AdjustKeyMapValues() {
+    key_config_map.clear();
+
+    if (inputScaleFactors.empty()) {
+        inputScaleFactors.push_back(1.0);
+    }
+    key_config_map[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(inputScaleFactors[0]);
+    // BUGFIX: size_t index avoids a signed/unsigned comparison with size().
+    for (size_t n = 0; n < inputScaleFactors.size(); n++) {
+        key_config_map[GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_") + std::to_string(n)] =
+                std::to_string(inputScaleFactors[n]);
+    }
+    key_config_map[GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE)] = dumpXNNPath;
+    key_config_map[GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION)] = dumpXNNGeneration;
+
+    // Map the stored acceleration mode back to its configuration string.
+    std::string device_mode;
+    if (gnaFlags.sw_fp32) {
+        device_mode = GNA_CONFIG_VALUE(SW_FP32);
+    } else {
+        for (auto&& value : supported_values) {
+#if GNA_LIB_VER == 1
+            if (value.second == gna_proc_type) {
+                device_mode = value.first;
+                break;
+            }
+#else
+            if (value.second.first == pluginGna2AccMode &&
+                value.second.second == pluginGna2DeviceConsistent) {
+                device_mode = value.first;
+                break;
+            }
+#endif
+        }
+    }
+    IE_ASSERT(!device_mode.empty());
+    key_config_map[GNA_CONFIG_KEY(DEVICE_MODE)] = device_mode;
+    key_config_map[GNA_CONFIG_KEY(COMPACT_MODE)] =
+            gnaFlags.compact_mode ? PluginConfigParams::YES: PluginConfigParams::NO;
+    // BUGFIX: EXCLUSIVE_ASYNC_REQUESTS was assigned twice; one assignment kept.
+    key_config_map[CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS)] =
+            gnaFlags.exclusive_async_requests ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[GNA_CONFIG_KEY(PRECISION)] = gnaPrecision.name();
+    key_config_map[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] =
+            gnaFlags.uniformPwlDesign ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[CONFIG_KEY(PERF_COUNT)] =
+            gnaFlags.performance_counting ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(gnaFlags.gna_lib_async_threads_num);
+    key_config_map[CONFIG_KEY(SINGLE_THREAD)] =
+            gnaFlags.gna_openmp_multithreading ? PluginConfigParams::NO: PluginConfigParams::YES;
+}
+
+// Returns the string value recorded for a supported configuration key;
+// throws for a key that is not present in key_config_map.
+std::string Config::GetParameter(const std::string& name) const {
+    const auto it = key_config_map.find(name);
+    if (it != key_config_map.end()) {
+        return it->second;
+    }
+    THROW_GNA_EXCEPTION << "Unsupported config key: " << name;
+}
+
+// Lists every supported configuration key (the keys of key_config_map).
+std::vector<std::string> Config::GetSupportedKeys() const {
+    std::vector<std::string> keys;
+    keys.reserve(key_config_map.size());
+    for (const auto& entry : key_config_map) {
+        keys.push_back(entry.first);
+    }
+    return keys;
+}
+
+} // namespace GNAPluginNS
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#if GNA_LIB_VER == 1
+#include <gna-api.h>
+#else
+#include <gna2-inference-api.h>
+#include <gna2-common-api.h>
+#endif
+#include "ie_precision.hpp"
+#include "descriptions/gna_flags.hpp"
+#include <vector>
+#include <map>
+
+namespace GNAPluginNS {
+
+// Aggregated GNA plugin configuration: parsed option values plus a cached
+// string key/value map (key_config_map) used to answer config queries.
+struct Config {
+    Config() {
+        AdjustKeyMapValues();
+    }
+    // Parses/validates options from an IE config map and updates the fields.
+    void UpdateFromMap(const std::map<std::string, std::string>& configMap);
+    // Rebuilds key_config_map from the current field values.
+    void AdjustKeyMapValues();
+    // Returns the string value for a supported key; throws for unknown keys.
+    std::string GetParameter(const std::string& name) const;
+    // Lists all keys present in key_config_map.
+    std::vector<std::string> GetSupportedKeys() const;
+
+    // precision of GNA hardware model
+    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
+
+    // Path and target GNA generation for the embedded model dump (empty = off).
+    std::string dumpXNNPath;
+    std::string dumpXNNGeneration;
+
+#if GNA_LIB_VER == 1
+    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
+#else
+    Gna2AccelerationMode pluginGna2AccMode = Gna2AccelerationModeSoftware;
+    Gna2DeviceVersion pluginGna2DeviceConsistent = Gna2DeviceVersion1_0;
+#endif
+
+    // Per-input scale factors (index = input index); missing entries become 1.0f.
+    std::vector<float> inputScaleFactors;
+    GNAFlags gnaFlags;
+
+    // String view of this configuration, kept in sync by AdjustKeyMapValues().
+    std::map<std::string, std::string> key_config_map;
+};
+
+} // namespace GNAPluginNS
#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
#include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
#include "gna_executable_network.hpp"
+#include "gna_plugin_config.hpp"
namespace GNAPluginNS {
class GNAPluginInternal : public InferenceEngine::InferencePluginInternal {
- public:
+private:
+    // Plugin-level defaults; per-load options are layered on top of these.
+    Config defaultConfig;
+    std::weak_ptr <GNAPlugin> plgPtr;
+    // Returns the plugin created by the most recent network load if it is
+    // still alive, otherwise a fresh default-constructed plugin.
+    std::shared_ptr<GNAPlugin> GetCurrentPlugin() const {
+        auto ptr = plgPtr.lock();
+        if (ptr == nullptr) {
+            return std::make_shared<GNAPlugin>();
+        } else {
+            return ptr;
+        }
+    }
+
+public:
InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(const InferenceEngine::ICore * core,
const InferenceEngine::ICNNNetwork &network,
const std::map<std::string, std::string> &config) override {
- return std::make_shared<GNAExecutableNetwork>(*cloneNet(network), config);
+ Config updated_config(defaultConfig);
+ updated_config.UpdateFromMap(config);
+ auto plg = std::make_shared<GNAPlugin>(updated_config.key_config_map);
+ plgPtr = plg;
+ return std::make_shared<GNAExecutableNetwork>(*cloneNet(network), plg);
}
+
void SetConfig(const std::map<std::string, std::string> &config) override {
- auto plg = std::make_shared<GNAPlugin>();
- plg->SetConfig(config);
+ defaultConfig.UpdateFromMap(config);
}
+
InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(
const std::string &modelFileName,
const std::map<std::string, std::string> &config) override {
- return make_executable_network(std::make_shared<GNAExecutableNetwork>(modelFileName, config));
+ Config updated_config(defaultConfig);
+ updated_config.UpdateFromMap(config);
+ auto plg = std::make_shared<GNAPlugin>(updated_config.key_config_map);
+ plgPtr = plg;
+ return make_executable_network(std::make_shared<GNAExecutableNetwork>(modelFileName, plg));
}
+
using InferenceEngine::InferencePluginInternal::ImportNetwork;
std::string GetName() const noexcept override {
- auto plg = std::make_shared<GNAPlugin>();
- return plg->GetName();
+ return GetCurrentPlugin()->GetName();
}
void QueryNetwork(const InferenceEngine::ICNNNetwork& network,
const std::map<std::string, std::string>& config,
InferenceEngine::QueryNetworkResult& res) const override {
- auto plg = std::make_shared<GNAPlugin>();
+ auto plg = GetCurrentPlugin();
try {
plg->SetConfig(config);
} catch (InferenceEngine::details::InferenceEngineException) {}
InferenceEngine::Parameter GetMetric(const std::string& name,
const std::map<std::string, InferenceEngine::Parameter> & options) const override {
- GNAPlugin statelessPlugin;
- return statelessPlugin.GetMetric(name, options);
+ return GetCurrentPlugin()->GetMetric(name, options);
}
InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string, InferenceEngine::Parameter> & options) const override {
- GNAPlugin statelessPlugin;
- return statelessPlugin.GetConfig(name, options);
+ return defaultConfig.GetParameter(name);
}
};
enum class ConcatAlignment {
DISABLED,
DISABLED_FOR_FP32,
- ENABLED
- } ConcatAlignmentPolicy = ConcatAlignment::ENABLED;
+ ENABLED,
+ FAST
+ } ConcatAlignmentPolicy = ConcatAlignment::FAST;
};
inline std::ostream& operator<<(std::ostream& os, Policy::ScaleShift policy) {
return os;
}
+// Streams the symbolic name of a ConcatAlignment policy; an unrecognized
+// value poisons the stream (failbit), matching the other policy printers.
+inline std::ostream& operator<<(std::ostream& os, Policy::ConcatAlignment policy) {
+    if (policy == Policy::ConcatAlignment::DISABLED) {
+        os << "DISABLED";
+    } else if (policy == Policy::ConcatAlignment::DISABLED_FOR_FP32) {
+        os << "DISABLED_FOR_FP32";
+    } else if (policy == Policy::ConcatAlignment::ENABLED) {
+        os << "ENABLED";
+    } else if (policy == Policy::ConcatAlignment::FAST) {
+        os << "FAST";
+    } else {
+        os.setstate(std::ios_base::failbit);
+    }
+    return os;
+}
+
+
} // namespace GNAPluginNS
using namespace InferenceEngine;
using namespace InferenceEngine::PluginConfigParams;
-Parameter GNAPlugin::GetConfig(const std::string& name, const std::map<std::string, Parameter> & options) const {
-    auto configKeys = supportedConfigKeysWithDefaults();
-    auto result = configKeys.find(name);
-    if (result == configKeys.end()) {
-        THROW_GNA_EXCEPTION << "unsupported config key: " << name;
-    }
-    return result->second;
+// Lookup is delegated to the Config object; the options map is unused.
+Parameter GNAPlugin::GetConfig(const std::string& name, const std::map<std::string, Parameter> & /*options*/) const {
+    return config.GetParameter(name);
}
Parameter GNAPlugin::GetMetric(const std::string& name, const std::map<std::string, InferenceEngine::Parameter> & options) const {
const std::unordered_map<std::string, std::function<Parameter()>> queryApiSupported = {
{METRIC_KEY(AVAILABLE_DEVICES), [this]() {return GetAvailableDevices();}},
- {METRIC_KEY(SUPPORTED_CONFIG_KEYS), [this]() {return supportedConfigKeys();}},
+ {METRIC_KEY(SUPPORTED_CONFIG_KEYS), [this]() {return config.GetSupportedKeys();}},
{METRIC_KEY(FULL_DEVICE_NAME), [&options, this]() {
auto availableDevices = GetAvailableDevices().as<std::vector<std::string>>();
return devices;
}
-
-std::map<std::string, std::string> GNAPlugin::supportedConfigKeysWithDefaults() const {
- std::map<std::string, std::string> options = {
- {GNA_CONFIG_KEY(SCALE_FACTOR), "1.0"},
- {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), ""},
- {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), ""},
- {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_AUTO},
- {GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(NO)},
- {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
- {GNA_CONFIG_KEY(PRECISION), Precision(Precision::I8).name()},
- {GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(YES)},
- {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)},
- {GNA_CONFIG_KEY(LIB_N_THREADS), "1"},
- {CONFIG_KEY(SINGLE_THREAD), CONFIG_VALUE(YES)}
- };
- return options;
-}
-
-
-std::vector<std::string> GNAPlugin::supportedConfigKeys()const {
- std::vector<std::string> result;
- for (auto && configOption : supportedConfigKeysWithDefaults()) {
- result.push_back(configOption.first);
- }
- return result;
-}
void InsertConcatAligningFilterPass::run() {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
+
+ if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED) {
+ return;
+ }
// aligning specific not required in fp32 mode
if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED_FOR_FP32 && !quantized) {
return;
// encodes offset to beginning of split layer input
concatAligningFilter->params["output_offset"] =
std::to_string((aligned64_offset / bytesPerConcatElement) * (quantized ? bytesPerConcatElement : 4));
+
+ // for padded rows we cannot use copy layer - TBD how to implement
+ concatAligningFilter->params["num_rows_padded"] = std::to_string(num_rows_padded);
+
// encodes original output size
concatAligningFilter->params["original_num_rows"] = std::to_string(num_rows_in);
saveGraphToDot(*network.get(), out, [](const CNNLayerPtr layer,
ordered_properties &printed_properties,
ordered_properties &node_properties) {});
- network->serialize(name + ".xml", "", nullptr);
+ network->serialize(name + ".xml", name + ".bin", nullptr);
};
#else
auto dumpNetworkAfterPass = [] (std::shared_ptr<Pass> ) {};
void HeteroAsyncInferRequest::StartAsync_ThreadUnsafe() {
    _heteroInferRequest->updateInOutIfNeeded();
-    RunFirstStage();
+    // RunFirstStage now takes an explicit stage range - run the full pipeline.
+    RunFirstStage(_pipeline.begin(), _pipeline.end());
}
StatusCode HeteroAsyncInferRequest::Wait(int64_t millis_timeout) {
}
}
+// Stores the blob on the hetero request itself, then forwards it to every
+// per-device sub-request that recognizes `name` as an input.
+// NOTE(review): NOT_FOUND errors from a sub-request are deliberately
+// tolerated - presumably a given input belongs to only one device's
+// subgraph; any other exception is re-thrown.
+void HeteroInferRequest::SetBlob(const char* name, const InferenceEngine::Blob::Ptr& data) {
+    InferenceEngine::InferRequestInternal::SetBlob(name, data);
+    assert(!_inferRequests.empty());
+    for (auto &&desc : _inferRequests) {
+        auto &r = desc._request;
+        assert(nullptr != r);
+        InputInfo::Ptr foundInput;
+        DataPtr foundOutput;
+        try {
+            // if `name` is input blob
+            if (findInputAndOutputBlobByName(name, foundInput, foundOutput)) {
+                r->SetBlob(name, data, foundInput->getPreProcess());
+            }
+        } catch (const InferenceEngine::details::InferenceEngineException & ex) {
+            std::string message = ex.what();
+            // Only swallow "not found" - the blob may not exist on this device.
+            if (message.find(NOT_FOUND_str) == std::string::npos)
+                throw ex;
+        }
+    }
+}
+
void HeteroInferRequest::InferImpl() {
updateInOutIfNeeded();
size_t i = 0;
void InferImpl() override;
+ void SetBlob(const char* name, const InferenceEngine::Blob::Ptr& data) override;
+
void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override;
void updateInOutIfNeeded();
list(REMOVE_ITEM LIBRARY_SRC ${IE_STATIC_DEPENDENT_FILES})
set(IE_BASE_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/cnn_network_ngraph_impl.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/generic_ie.cpp
${CMAKE_CURRENT_SOURCE_DIR}/blob_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ie_data.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ie_layouts.cpp
"${IE_MAIN_SOURCE_DIR}/src/plugin_api/*.h")
add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_src})
-add_clang_format_target(${TARGET_NAME}_plugin_api_clang_format FOR_SOURCES ${plugin_api_src})
# Create common base object library
target_compile_definitions(${TARGET_NAME}_common_obj PRIVATE IMPLEMENT_INFERENCE_ENGINE_API)
target_include_directories(${TARGET_NAME}_common_obj PRIVATE
+ $<TARGET_PROPERTY:${TARGET_NAME}_transformations,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:${TARGET_NAME}_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
target_include_directories(${TARGET_NAME}_common_obj SYSTEM PRIVATE
$<TARGET_PROPERTY:pugixml,INTERFACE_INCLUDE_DIRECTORIES>)
target_include_directories(${TARGET_NAME}_obj PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+ $<TARGET_PROPERTY:${TARGET_NAME}_ir_readers,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:${TARGET_NAME}_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
if(ENABLE_PROFILING_ITT AND INTEL_ITT_LIBS)
endif()
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}_obj)
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME}_obj)
# Create shared library file from object library
"${IE_MAIN_SOURCE_DIR}/src/legacy_api/src")
add_cpplint_target(${TARGET_NAME}_nn_builder_cpplint FOR_TARGETS ${TARGET_NAME}_nn_builder)
-add_clang_format_target(${TARGET_NAME}_nn_builder_clang_format FOR_TARGETS ${TARGET_NAME}_nn_builder)
# Static library used for unit tests which are always built
install(FILES "${TBB}/LICENSE"
DESTINATION ${IE_CPACK_IE_DIR}/external/tbb
COMPONENT tbb)
+ install(FILES "${TBB}/cmake/TBBConfig.cmake"
+ "${TBB}/cmake/TBBConfigVersion.cmake"
+ DESTINATION ${IE_CPACK_IE_DIR}/external/tbb/cmake
+ COMPONENT tbb)
endif()
ie_cpack_add_component(core REQUIRED DEPENDS ${core_components})
install(DIRECTORY "${IE_MAIN_SOURCE_DIR}/include" DESTINATION ${IE_CPACK_IE_DIR}
COMPONENT core)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}_nn_builder
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
install(FILES "${OpenVINO_BINARY_DIR}/share/ie_parallel.cmake"
"${OpenVINO_BINARY_DIR}/share/InferenceEngineConfig.cmake"
return std::make_shared<InferenceEngine::TBlob<int8_t>>(desc);
case InferenceEngine::Precision::I32:
return std::make_shared<InferenceEngine::TBlob<int32_t>>(desc);
+ case InferenceEngine::Precision::BF16:
+ return std::make_shared<InferenceEngine::TBlob<short>>(desc);
default:
THROW_IE_EXCEPTION << "precision is no set";
}
#include "graph_tools.hpp"
#include "graph_transformer.h"
#include "ie_util_internal.hpp"
-#include "ie_cnn_layer_builder_ngraph.h"
#include "ie_ngraph_utils.hpp"
#include "ie_profiling.hpp"
#include "network_serializer.h"
if (ptr) {
ptr->reshape(dims, ptr->getTensorDesc().getLayout());
} else {
- const auto precision = details::ngraph::convertPrecision(output.get_element_type());
+ const auto precision = details::convertPrecision(output.get_element_type());
const auto layout = TensorDesc::getLayoutByDims(dims);
ptr.reset(new NGraphData(this, outName, {precision, dims, layout}));
}
::ngraph::pass::ConvertOpSet1ToLegacy().run_on_function(graph);
cnnNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, *this);
}
-
-std::shared_ptr<CNNNetworkNGraphImpl> CNNNetworkNGraphImpl::cloneNGraphImpl() const {
- auto result = std::make_shared<CNNNetworkNGraphImpl>(cloneFunction());
- for (const auto& outputInfo : _outputData) {
- result->_outputData[outputInfo.first]->setPrecision(outputInfo.second->getPrecision());
- result->_outputData[outputInfo.first]->setLayout(outputInfo.second->getLayout());
- }
- for (const auto& inputInfo : _inputData) {
- result->_inputData[inputInfo.first]->setPrecision(inputInfo.second->getPrecision());
- result->_inputData[inputInfo.first]->setLayout(inputInfo.second->getLayout());
- result->_inputData[inputInfo.first]->getPreProcess() = inputInfo.second->getPreProcess();
- }
- if (cnnNetwork)
- result->cnnNetwork = cloneNet(*cnnNetwork);
- return result;
-}
-
-void CNNNetworkNGraphImpl::transformConstants() {
- if (!cnnNetwork)
- convertToCNNNetworkImpl();
- // Remove all redundant constant and convert unsupported precisions
- ConstTransformer transformator(cnnNetwork.get());
- transformator.fullTrim();
-}
-
-void InferenceEngine::details::CNNLayerCreator::on_adapter(const std::string& name,
- ::ngraph::ValueAccessor<void>& adapter) {
- if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::element::Type>>(&adapter)) {
- auto type = static_cast<::ngraph::element::Type&>(*a);
- params[name] = details::ngraph::convertPrecision(type).name();
- } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::PartialShape>>(&adapter)) {
- std::string dims;
- auto shape = static_cast<::ngraph::PartialShape&>(*a);
- for (size_t i = 0; i < shape.rank().get_length(); i++) {
- if (!dims.empty()) dims += ",";
- dims += std::to_string(shape[i].get_length());
- }
- params[name] = dims;
- } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Shape>>(&adapter)) {
- std::string dims;
- auto shape = static_cast<::ngraph::Shape&>(*a);
- for (size_t i = 0; i < shape.size(); i++) {
- if (!dims.empty()) dims += ",";
- dims += std::to_string(shape[i]);
- }
- params[name] = dims;
- } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Strides>>(&adapter)) {
- std::string dims;
- auto shape = static_cast<::ngraph::Strides&>(*a);
- for (size_t i = 0; i < shape.size(); i++) {
- if (!dims.empty()) dims += ",";
- dims += std::to_string(shape[i]);
- }
- params[name] = dims;
- }
-}
-
-InferenceEngine::details::CNNLayerCreator::CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node): node(node) {
- addSpecificCreator({"Parameter"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), "Input",
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<CNNLayer>(attrs);
- return res;
- });
- // TODO - Remove "GreaterEq" once ngraph transitions to GreaterEqual
- addSpecificCreator({"Eltwise", "Subtract", "Power", "Maximum", "Divide", "Greater", "GreaterEqual", "FloorMod", "LogicalOr", "LogicalAnd", "LogicalXor",
- "GreaterEq", "Less", "LessEqual", "Equal", "NotEqual", "Multiply", "Add"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<EltwiseLayer>(attrs);
- res->params = params;
- if (node->description() == "Maximum") {
- res->params["operation"] = "max";
- } else if (node->description() == "Power") {
- res->params["operation"] = "pow";
- } else if (node->description() == "Subtract") {
- res->params["operation"] = "sub";
- } else if (node->description() == "Divide") {
- res->params["operation"] = "div";
- } else if (node->description() == "LessEqual") {
- res->params["operation"] = "less_equal";
- } else if (node->description() == "Less") {
- res->params["operation"] = "less";
- } else if (node->description() == "Equal") {
- res->params["operation"] = "equal";
- } else if (node->description() == "NotEqual") {
- res->params["operation"] = "not_equal";
- } else if (node->description() == "FloorMod") {
- res->params["operation"] = "floor_mod";
- } else if (node->description() == "Multiply") {
- res->params["operation"] = "prod";
- } else if (node->description() == "Add") {
- res->params["operation"] = "sum";
- } else if (node->description() == "Greater") {
- res->params["operation"] = "greater";
- } else if (node->description() == "GreaterEq") {
- res->params["operation"] = "greater_equal";
- } else if (node->description() == "GreaterEqual") {
- res->params["operation"] = "greater_equal";
- } else if (node->description() == "LogicalOr") {
- res->params["operation"] = "logical_or";
- } else if (node->description() == "LogicalAnd") {
- res->params["operation"] = "logical_and";
- } else if (node->description() == "LogicalXor") {
- res->params["operation"] = "logical_xor";
- } else if (node->description() == "Eltwise") {
- auto castedLayer = std::dynamic_pointer_cast<::ngraph::op::Eltwise>(node);
- if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << attrs.type << " layer " << attrs.name;
- std::string type;
- switch (castedLayer->eltwise_type) {
- case ELTWISE_TYPE::Sum:
- type = "sum";
- break;
- case ELTWISE_TYPE::Prod:
- type = "prod";
- break;
- default:
- THROW_IE_EXCEPTION << "Not supported eltwise type!";
- }
-
- res->params["operation"] = type;
- }
- return res;
- });
- addSpecificCreator({"Concat"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), node->description(),
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<ConcatLayer>(attrs);
- res->params = params;
- return res;
- });
- addSpecificCreator({"AvgPool", "MaxPool"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), "Pooling",
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<PoolingLayer>(attrs);
- res->params = params;
- if (res->params.find("auto_pad") != res->params.end() &&
- details::CaselessEq<std::string>()(res->params["auto_pad"], "EXPLICIT"))
- res->params.erase("auto_pad");
-
- if (res->params.find("exclude_pad") != res->params.end()) {
- res->params["exclude-pad"] = res->params["exclude_pad"];
- res->params.erase("exclude_pad");
- }
-
- if (node->description() == "MaxPool") {
- res->params["pool-method"] = "max";
- } else if (node->description() == "AvgPool") {
- res->params["pool-method"] = "avg";
- }
- return res;
- });
- addSpecificCreator({"Select"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), node->description(),
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<SelectLayer>(attrs);
- res->params = params;
- return res;
- });
- addSpecificCreator({"BinaryConvolution"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), node->description(),
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<BinaryConvolutionLayer>(attrs);
-
- // todo: investigate difference between ngraph parameters for BinConvolution and the implementation above
- // this leads to accuracy issue for Precollected_ONNX_ResNet50_88percentinto1bit e2e test
- // res->params = params;
-
- auto castedLayer = ::ngraph::as_type_ptr<::ngraph::op::v1::BinaryConvolution>(node);
-
- std::string value;
- for (const auto& val : castedLayer->get_pads_begin()) {
- if (!value.empty()) value += ",";
- value += Builder::asString(val);
- }
- res->params["pads_begin"] = value;
-
- value.clear();
- for (const auto& val : castedLayer->get_pads_end()) {
- if (!value.empty()) value += ",";
- value += Builder::asString(val);
- }
- res->params["pads_end"] = value;
-
- switch (castedLayer->get_auto_pad()) {
- case ::ngraph::op::PadType::SAME_UPPER:
- res->params["auto_pad"] = "same_upper";
- break;
- case ::ngraph::op::PadType::SAME_LOWER:
- res->params["auto_pad"] = "same_lower";
- break;
- case ::ngraph::op::PadType::VALID:
- res->params["auto_pad"] = "valid";
- break;
- default:
- break;
- }
-
- value.clear();
- for (const auto& val : castedLayer->get_strides()) {
- if (!value.empty()) value += ",";
- value += Builder::asString(val);
- }
- res->params["strides"] = value;
-
- value.clear();
- for (const auto& val : castedLayer->get_dilations()) {
- if (!value.empty()) value += ",";
- value += Builder::asString(val);
- }
- res->params["dilations"] = value;
-
- // Restore kernel size and output
- const auto& shape = castedLayer->get_input_shape(1);
- res->params["output"] = Builder::asString(shape[0]);
-
- value.clear();
- for (size_t i = 2; i < shape.size(); i++) {
- if (!value.empty()) value += ",";
- value += Builder::asString(shape[i]);
- }
- res->params["kernel"] = value;
-
- switch (castedLayer->get_mode()) {
- case ::ngraph::op::v1::BinaryConvolution::BinaryConvolutionMode::XNOR_POPCOUNT:
- res->params["mode"] = "xnor-popcount";
- }
-
- auto weights_shape = castedLayer->input(1).get_source_output().get_shape();
- res->params["input"] = Builder::asString(weights_shape[1]);
- res->params["pad_value"] = Builder::asString(castedLayer->get_pad_value());
-
- Builder::NodeConverter<::ngraph::op::Constant> converter;
-
- const auto weightsNode = castedLayer->get_inputs()[1].get_output().get_node();
- if (converter.canCreate(weightsNode)) {
- const auto& weights = converter.createLayer(weightsNode);
- res->blobs["weights"] = weights->blobs["custom"];
- res->_weights = weights->blobs["custom"];
- }
- return res;
- });
-
- addSpecificCreator({"SpaceToBatch"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), node->description(),
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<SpaceToBatchLayer>(attrs);
- res->params = params;
- return res;
- });
-
- addSpecificCreator({"BatchToSpace"}, [](const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> params) -> CNNLayerPtr {
- LayerParams attrs = {node->get_friendly_name(), node->description(),
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- auto res = std::make_shared<BatchToSpaceLayer>(attrs);
- res->params = params;
- return res;
- });
-}
-
-CNNLayerPtr InferenceEngine::details::CNNLayerCreator::create() {
- auto one_from = [](const std::string& desc, const std::vector<std::string>& descs) -> bool {
- for (const auto& d : descs) {
- if (details::CaselessEq<std::string>()(d, desc)) return true;
- }
- return false;
- };
- LayerParams attrs = {node->get_friendly_name(), node->description(),
- details::ngraph::convertPrecision(node->get_output_element_type(0))};
- if (creators.find(node->description()) != creators.end())
- return creators[node->description()](node, params);
-
- auto res = std::make_shared<CNNLayer>(attrs);
- res->params = params;
- return res;
-}
std::shared_ptr<const ::ngraph::Function> getFunction() const noexcept override {
return !cnnNetwork ? _ngraph_function : nullptr;
}
- std::shared_ptr<::ngraph::Function> getFunction() noexcept {
+ std::shared_ptr<::ngraph::Function> getFunction() noexcept override {
return !cnnNetwork ? _ngraph_function : nullptr;
}
noexcept override;
void convertToCNNNetworkImpl();
-
- std::shared_ptr<CNNNetworkNGraphImpl> cloneNGraphImpl() const;
- void transformConstants();
protected:
std::shared_ptr<::ngraph::Function> _ngraph_function;
virtual std::shared_ptr<::ngraph::Function> cloneFunction(bool constFolding = false, const std::map<std::string,
friend INFERENCE_ENGINE_API_CPP(std::shared_ptr<CNNNetworkImpl>)
convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph,
- const CNNNetworkNGraphImpl & nGraphImpl);
+ const ICNNNetwork& nGraphImpl);
/**
* @brief Reshape on the same shape
IE_SUPPRESS_DEPRECATED_END
-/**
- * @brief Creator for CNNLayer from nGraph op
- */
-class CNNLayerCreator : public ::ngraph::AttributeVisitor {
-public:
- using CreatorFor = std::function<CNNLayerPtr(const std::shared_ptr<::ngraph::Node>& node,
- const std::map<std::string, std::string> param)>;
- explicit CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node);
-
- CNNLayerPtr create();
-
- void on_attribute(const std::string& name, std::string& value) override {
- params[name] = value;
- }
-
- void on_attribute(const std::string& name, bool& value) override {
- params[name] = value ? "true" : "false";
- }
-
- void addSpecificCreator(const std::vector<std::string>& forTypes, const CreatorFor& creator) {
- for (const auto type : forTypes) {
- creators[type] = creator;
- }
- }
-
- void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::string>& adapter) override {
- std::string data = adapter.get();
- std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c) {
- return std::tolower(c);
- });
- params[name] = data;
- }
-
- void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::vector<int64_t>>& adapter) override {
- std::string dims;
- auto shape = adapter.get();
- for (size_t i = 0; i < shape.size(); i++) {
- if (!dims.empty()) dims += ",";
- dims += std::to_string(shape[i]);
- }
- params[name] = dims;
- }
-
- void on_adapter(const std::string& name, ::ngraph::ValueAccessor<double>& adapter) override {
- params[name] = std::to_string(adapter.get());
- }
-
- void on_adapter(const std::string& name, ::ngraph::ValueAccessor<int64_t>& adapter) override {
- params[name] = std::to_string(adapter.get());
- }
-
- void on_adapter(const std::string& name, ::ngraph::ValueAccessor<void>& adapter) override;
-
-private:
- std::shared_ptr<::ngraph::Node> node;
- std::map<std::string, std::string> params;
- std::map<std::string, CreatorFor> creators;
-};
-
typedef std::shared_ptr<CNNNetworkNGraphImpl> CNNNetworkNGraphImplPtr;
} // namespace details
} // namespace InferenceEngine
// Set dynamic output shapes if input shapes are not defined
for (size_t i = 0; i < outputs.size(); i++) {
const auto& port = outputs[i];
- auto type = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+ auto type = InferenceEngine::details::convertPrecision(port.precision);
set_output_type(i, type, PartialShape::dynamic());
}
return;
Shape this_ishape = get_input_shape(i);
InferenceEngine::SizeVector dims = this_ishape;
InferenceEngine::Blob::Ptr input = make_blob_with_precision(InferenceEngine::TensorDesc(
- InferenceEngine::details::ngraph::convertPrecision(get_input_element_type(i)), dims,
+ InferenceEngine::details::convertPrecision(get_input_element_type(i)), dims,
InferenceEngine::TensorDesc::getLayoutByDims(dims)));
inputs.emplace_back(input);
}
}
}
+ // WA: Proposal shape infer has to know number of outputs
+ if (type == "Proposal" && parameters.find("num_outputs") == parameters.end()) {
+ parameters["num_outputs"] = std::to_string(outputs.size());
+ }
+
ret = impl->inferShapes(inputs, parameters, blobs, outShapes, nullptr);
IE_SUPPRESS_DEPRECATED_END
for (size_t i = 0; i < outputs.size(); i++) {
const auto& port = outputs[i];
ngraph::Shape outShape(outShapes[i]);
- auto type = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+ auto type = InferenceEngine::details::convertPrecision(port.precision);
set_output_type(i, type, PartialShape(outShape));
}
for (size_t i = 0; i < outputs.size(); i++) {
const auto& port = outputs[i];
ngraph::Shape outShape(port.dims);
- auto type = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+ auto type = InferenceEngine::details::convertPrecision(port.precision);
set_output_type(i, type, PartialShape(outShape));
}
initialized++;
#include <vector>
#include <ngraph/opsets/opset.hpp>
+#include "cpp/ie_cnn_net_reader.h"
#include "cpp_interfaces/base/ie_plugin_base.hpp"
#include "details/ie_exception_conversion.hpp"
#include "details/ie_so_pointer.hpp"
#include "file_utils.h"
-#include "ie_cnn_net_reader_impl.h"
#include "ie_icore.hpp"
-#include "ie_ir_reader.hpp"
#include "ie_plugin.hpp"
#include "ie_plugin_config.hpp"
#include "ie_profiling.hpp"
namespace {
+std::once_flag flag;
+std::shared_ptr<InferenceEngine::details::SharedObjectLoader> cnnReaderLoader;
+
+std::shared_ptr<InferenceEngine::details::SharedObjectLoader>
+createCnnReaderLoader() {
+ std::call_once(flag, [&] () {
+ FileUtils::FilePath libraryName = FileUtils::toFilePath(std::string("inference_engine_ir_readers") + std::string(IE_BUILD_POSTFIX));
+ FileUtils::FilePath irReadersLibraryPath = FileUtils::makeSharedLibraryName(getInferenceEngineLibraryPath(), libraryName);
+
+ if (!FileUtils::fileExist(irReadersLibraryPath)) {
+ THROW_IE_EXCEPTION << "Please, make sure that Inference Engine IR readers library "
+ << FileUtils::fromFilePath(::FileUtils::makeSharedLibraryName({}, libraryName)) << " is in "
+ << getIELibraryPath();
+ }
+ cnnReaderLoader = std::shared_ptr<InferenceEngine::details::SharedObjectLoader>(
+ new InferenceEngine::details::SharedObjectLoader(irReadersLibraryPath.c_str()));
+ });
+
+ return cnnReaderLoader;
+}
+
IInferencePluginAPI* getInferencePluginAPIInterface(IInferencePlugin* iplugin) {
return dynamic_cast<IInferencePluginAPI*>(iplugin);
}
} // namespace
+CNNNetReaderPtr CreateCNNNetReaderPtr() noexcept {
+ auto loader = createCnnReaderLoader();
+ return CNNNetReaderPtr(loader);
+}
+
IE_SUPPRESS_DEPRECATED_END
DeviceIDParser::DeviceIDParser(const std::string& deviceNameWithID) {
}
class Core::Impl : public ICore {
+ // Fields are ordered by deletion order
ITaskExecutor::Ptr _taskExecutor = nullptr;
IE_SUPPRESS_DEPRECATED_START
std::vector<FileUtils::FilePath> listOfExtentions;
};
- std::map<std::string, PluginDescriptor> pluginRegistry;
std::unordered_set<std::string> opsetNames;
std::vector<IExtensionPtr> extensions;
+ std::map<std::string, PluginDescriptor> pluginRegistry;
+
public:
Impl();
~Impl() override;
{
// for compatibility with samples / demo
- if (deviceName.find("HETERO:") == 0) {
- deviceNames = DeviceIDParser::getHeteroDevices(deviceName.substr(7));
+ if (deviceName.find("HETERO") == 0) {
+ auto pos = deviceName.find_first_of(":");
+ if (pos != std::string::npos) {
+ deviceNames = DeviceIDParser::getHeteroDevices(deviceName.substr(pos + 1));
+ }
deviceNames.push_back("HETERO");
} else if (deviceName.find("MULTI") == 0) {
+ auto pos = deviceName.find_first_of(":");
+ if (pos != std::string::npos) {
+ deviceNames = DeviceIDParser::getMultiDevices(deviceName.substr(pos + 1));
+ }
deviceNames.push_back("MULTI");
- deviceNames = DeviceIDParser::getMultiDevices(deviceName.substr(6));
} else {
deviceNames.push_back(deviceName);
}
CNNNetwork Core::ReadNetwork(const std::string& modelPath, const std::string& binPath) const {
IE_PROFILING_AUTO_SCOPE(Core::ReadNetwork)
IE_SUPPRESS_DEPRECATED_START
- auto cnnReader = std::shared_ptr<ICNNNetReader>(CreateCNNNetReader());
ResponseDesc desc;
+ CNNNetReaderPtr cnnReader(createCnnReaderLoader());
StatusCode rt = cnnReader->ReadNetwork(modelPath.c_str(), &desc);
if (rt != OK) THROW_IE_EXCEPTION << desc.msg;
- auto cnnNetReaderImpl = std::dynamic_pointer_cast<details::CNNNetReaderImpl>(cnnReader);
- if (cnnNetReaderImpl && cnnReader->getVersion(&desc) >= 10) {
- cnnNetReaderImpl->addExtensions(_impl->getExtensions());
+ if (cnnReader->getVersion(&desc) >= 10) {
+ cnnReader->addExtensions(_impl->getExtensions());
}
std::string bPath = binPath;
if (bPath.empty()) {
CNNNetwork Core::ReadNetwork(const std::string& model, const Blob::CPtr& weights) const {
IE_PROFILING_AUTO_SCOPE(Core::ReadNetwork)
IE_SUPPRESS_DEPRECATED_START
- auto cnnReader = std::shared_ptr<ICNNNetReader>(CreateCNNNetReader());
ResponseDesc desc;
+ CNNNetReaderPtr cnnReader(createCnnReaderLoader());
StatusCode rt = cnnReader->ReadNetwork(model.data(), model.length(), &desc);
if (rt != OK) THROW_IE_EXCEPTION << desc.msg;
- auto cnnNetReaderImpl = std::dynamic_pointer_cast<details::CNNNetReaderImpl>(cnnReader);
- if (cnnNetReaderImpl && cnnReader->getVersion(&desc) >= 10) {
- cnnNetReaderImpl->addExtensions(_impl->getExtensions());
+ if (cnnReader->getVersion(&desc) >= 10) {
+ cnnReader->addExtensions(_impl->getExtensions());
}
TBlob<uint8_t>::Ptr weights_ptr;
if (weights) {
rt = cnnReader->SetWeights(weights_ptr, &desc);
if (rt != OK) THROW_IE_EXCEPTION << desc.msg;
IE_SUPPRESS_DEPRECATED_END
+
return CNNNetwork(cnnReader);
}
THROW_IE_EXCEPTION << "SetConfig is supported only for HETERO itself (without devices). "
"You can configure the devices with SetConfig before creating the HETERO on top.";
}
-
- if (config.find("TARGET_FALLBACK") != config.end()) {
- THROW_IE_EXCEPTION << "Please, specify TARGET_FALLBACK to the LoadNetwork directly, "
- "as you will need to pass the same TARGET_FALLBACK anyway.";
- }
}
// MULTI case
THROW_IE_EXCEPTION << "SetConfig is supported only for MULTI itself (without devices). "
"You can configure the devices with SetConfig before creating the MULTI on top.";
}
-
- if (config.find(MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES) != config.end()) {
- THROW_IE_EXCEPTION << "Please, specify DEVICE_PRIORITIES to the LoadNetwork directly, "
- "as you will need to pass the same DEVICE_PRIORITIES anyway.";
- }
}
if (deviceName.empty()) {
#include <ie_parameter.hpp>
#include <ie_iextension.h>
#include <ie_extension.h>
+
#include <ngraph/opsets/opset.hpp>
using namespace InferenceEngine;
template struct InferenceEngine::Parameter::RealData<std::vector<unsigned long>>;
template struct InferenceEngine::Parameter::RealData<std::tuple<unsigned int, unsigned int>>;
template struct InferenceEngine::Parameter::RealData<std::tuple<unsigned int, unsigned int, unsigned int>>;
+template struct InferenceEngine::Parameter::RealData<InferenceEngine::Blob::Ptr>;
#endif // __clang__
//
// ie_blob.h
#endif
}
+bool with_cpu_x86_bfloat16() {
+#ifdef ENABLE_MKL_DNN
+ return cpu.has(Xbyak::util::Cpu::tAVX512_BF16);
+#else
+ return false;
+#endif
+}
bool checkOpenMpEnvVars(bool includeOMPNumThreads) {
for (auto&& var : {
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
namespace InferenceEngine {
ITaskExecutor::Ptr ExecutorManagerImpl::getExecutor(std::string id) {
+ std::lock_guard<std::mutex> guard(taskExecutorMutex);
auto foundEntry = executors.find(id);
if (foundEntry == executors.end()) {
auto newExec = std::make_shared<CPUStreamsExecutor>(IStreamsExecutor::Config{id});
}
IStreamsExecutor::Ptr ExecutorManagerImpl::getIdleCPUStreamsExecutor(const IStreamsExecutor::Config& config) {
+ std::lock_guard<std::mutex> guard(streamExecutorMutex);
for (const auto& it : cpuStreamsExecutors) {
const auto& executor = it.second;
if (executor.use_count() != 1)
}
void ExecutorManagerImpl::clear(const std::string& id) {
+ std::lock_guard<std::mutex> stream_guard(streamExecutorMutex);
+ std::lock_guard<std::mutex> task_guard(taskExecutorMutex);
if (id.empty()) {
executors.clear();
cpuStreamsExecutors.clear();
}
}
+std::mutex ExecutorManager::_mutex;
ExecutorManager* ExecutorManager::_instance = nullptr;
+ExecutorManager* ExecutorManager::getInstance() {
+ /*
+ * 1) We do not use singleton implementation via STATIC LOCAL object like
+ *
+ * getInstance() {
+ * static ExecutorManager _instance;
+ * return &instance;
+ * }
+ *
+ * Because of problem with destruction order on program exit.
+ * Some IE classes like MKLDNN::Engine use this singleton in destructor.
+ * But they has no direct dependency from c++ runtime point of view and
+ * it's possible that _instance local static variable will be destroyed
+ * before MKLDNN::~Engine call. Any further manipulation with destroyed
+ * object will lead to exception or crashes.
+ *
+ * 2) We do not use singleton implementation via STATIC object like:
+ *
+ * ExecutorManager ExecutorManager::_instance;
+ * getInstance() {
+ * return &instance;
+ * }
+ *
+ * Because of problem with double destruction. In some test cases we use
+ * double link with IE module via static and dynamic version. Both modules
+ * have static object with same export name and it leads to double construction
+ * and double destruction of that object. For some c++ compilers (ex gcc 5.4)
+ * it lead to crash with "double free".
+ *
+ * That's why we use manual allocation of singleton instance on heap.
+ */
+ std::lock_guard<std::mutex> guard(_mutex);
+ if (_instance == nullptr) {
+ _instance = new ExecutorManager();
+ }
+ return _instance;
+}
+
ITaskExecutor::Ptr ExecutorManager::getExecutor(std::string id) {
return _impl.getExecutor(id);
}
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "inference_engine_ir_readers")
+
+if(ENABLE_LTO)
+ ie_enable_lto()
+endif()
+
+set(PUBLIC_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/")
+
+file(GLOB_RECURSE LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+file(GLOB_RECURSE PUBLIC_HEADERS ${PUBLIC_HEADERS_DIR}/*.h ${PUBLIC_HEADERS_DIR}/*.hpp)
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+
+source_group("src" FILES ${LIBRARY_SRC})
+source_group("include" FILES ${PUBLIC_HEADERS})
+
+# Create shared library
+
+add_library(${TARGET_NAME} SHARED ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+
+target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_INFERENCE_ENGINE_API
+ IMPLEMENT_INFERENCE_ENGINE_PLUGIN)
+
+target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR})
+target_include_directories(${TARGET_NAME} PRIVATE "${IE_MAIN_SOURCE_DIR}/src/inference_engine")
+
+target_link_libraries(${TARGET_NAME} PUBLIC inference_engine_plugin_api ${NGRAPH_LIBRARIES} inference_engine)
+target_link_libraries(${TARGET_NAME} PRIVATE pugixml)
+
+# code style
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
+add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
+
+# developer package
+
+ie_developer_export_targets(${TARGET_NAME})
+
+# install
+
+install(TARGETS ${TARGET_NAME}
+ RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
#include "ie_format_parser.h"
#include "ie_ir_reader.hpp"
#include "ie_profiling.hpp"
+#include "ie_plugin.hpp"
#include "parsers.h"
#include "xml_parse_utils.h"
CNNNetReaderImpl::CNNNetReaderImpl(const FormatParserCreator::Ptr& _creator)
: parseSuccess(false), _version(0), parserCreator(_creator) {}
+CNNNetReaderImpl::~CNNNetReaderImpl() { }
+
StatusCode CNNNetReaderImpl::SetWeights(const TBlob<uint8_t>::Ptr& weights, ResponseDesc* desc) noexcept {
if (!_parser && _version < 10) {
return DescriptionBuffer(desc) << "network must be read first";
}
try {
if (_version == 10) {
-#if defined(ENABLE_IR_READER)
// It's time to perform actual reading of V10 network and instantiate CNNNetworkNGraphImpl
IRReader v10Reader(extensions);
std::stringstream model;
xmlDoc->save(model);
network = std::make_shared<CNNNetworkNGraphImpl>(v10Reader.read(model.str(), weights));
-#else
- return DescriptionBuffer(desc) << "Please, recompile Inference Engine with the ENABLE_IR_READER=ON Cmake option";
-#endif
} else {
_parser->SetWeights(weights);
}
}
std::shared_ptr<IFormatParser> V2FormatParserCreator::create(size_t version) {
-#ifdef ENABLE_IR_READER
return std::make_shared<FormatParser>(version);
-#else
- THROW_IE_EXCEPTION << "Please, recompile Inference Engine library with the ENABLE_IR_READER=ON Cmake option";
- return nullptr;
-#endif
}
-InferenceEngine::ICNNNetReader* InferenceEngine::CreateCNNNetReader() noexcept {
- return new CNNNetReaderImpl(std::make_shared<V2FormatParserCreator>());
+INFERENCE_PLUGIN_API(InferenceEngine::StatusCode)
+CreateICNNNetReader(ICNNNetReader *& data, ResponseDesc *resp) noexcept {
+ data = new CNNNetReaderImpl(std::make_shared<V2FormatParserCreator>());
+ return StatusCode::OK;
}
+
IE_SUPPRESS_DEPRECATED_END
#include <vector>
#include "cnn_network_impl.hpp"
-#include "ie_icnn_net_reader.h"
#include "ie_memcpy.h"
#include "ie_profiling.hpp"
#include "parsers.h"
+#include "ie_util_internal.hpp"
namespace pugi {
class xml_node;
virtual ~FormatParserCreator() = default;
};
-struct V2FormatParserCreator : public FormatParserCreator {
+struct INFERENCE_ENGINE_API_CLASS(V2FormatParserCreator) : public FormatParserCreator {
std::shared_ptr<IFormatParser> create(size_t version) override;
};
IE_SUPPRESS_DEPRECATED_START
-class CNNNetReaderImpl : public ICNNNetReader {
+class INFERENCE_ENGINE_API_CLASS(CNNNetReaderImpl) : public ICNNNetReader {
public:
- explicit CNNNetReaderImpl(const FormatParserCreator::Ptr& _parserCreator);
+ explicit CNNNetReaderImpl(const FormatParserCreator::Ptr& _creator);
StatusCode ReadNetwork(const char* filepath, ResponseDesc* resp) noexcept override;
delete this;
}
- void addExtensions(const std::vector<InferenceEngine::IExtensionPtr>& ext);
+ void addExtensions(const std::vector<InferenceEngine::IExtensionPtr>& ext) override;
+
+ ~CNNNetReaderImpl() override;
private:
std::shared_ptr<InferenceEngine::details::IFormatParser> _parser;
std::shared_ptr<pugi::xml_document> xmlDoc;
std::vector<InferenceEngine::IExtensionPtr> extensions;
};
+
IE_SUPPRESS_DEPRECATED_END
} // namespace details
std::make_shared<LayerCreator<TopKLayer>>("TopK"),
std::make_shared<LayerCreator<UniqueLayer>>("Unique"),
std::make_shared<LayerCreator<NonMaxSuppressionLayer>>("NonMaxSuppression"),
- std::make_shared<LayerCreator<ScatterLayer>>("ScatterUpdate"),
+ std::make_shared<LayerCreator<ScatterUpdateLayer>>("ScatterUpdate"),
std::make_shared<LayerCreator<ExperimentalDetectronPriorGridGeneratorLayer>>("ExperimentalDetectronPriorGridGenerator"),
- std::make_shared<LayerCreator<ExperimentalDetectronGenerateProposalsSingleImageLayer>>("ExperimentalDetectronGenerateProposalsSingleImage")};
+ std::make_shared<LayerCreator<ExperimentalDetectronGenerateProposalsSingleImageLayer>>("ExperimentalDetectronGenerateProposalsSingleImage"),
+ std::make_shared<LayerCreator<ExperimentalDetectronTopKROIs>>("ExperimentalDetectronTopKROIs")};
creators.emplace_back(_version < 6 ? std::make_shared<LayerCreator<QuantizeLayer>>("Quantize")
: std::make_shared<LayerCreator<QuantizeLayer>>("FakeQuantize"));
}
}
};
-#ifdef ENABLE_IR_READER
class INFERENCE_ENGINE_API_CLASS(FormatParser): public IFormatParser {
-#else
-class FormatParser : public IFormatParser {
-#endif
public:
explicit FormatParser(size_t version);
// Input port hasn't precision
if (!input) {
const std::string& preStr = GetStrAttr(parentNode, "precision");
- type = InferenceEngine::details::ngraph::convertPrecision(preStr);
+ type = InferenceEngine::details::convertPrecision(preStr);
}
port.precision = type;
return port;
for (const auto& port : params.outputPorts) {
ngraph::op::GenericIE::PortIE iePort;
iePort.dims = port.dims;
- iePort.precision = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+ iePort.precision = InferenceEngine::details::convertPrecision(port.precision);
outputs.emplace_back(iePort);
}
THROW_IE_EXCEPTION << "Cannot read parameter for " << getType() << " layer with name: " << layerParsePrms.name;
return std::make_shared<ngraph::op::Convert>(inputs[0],
- details::ngraph::convertPrecision(GetStrAttr(dn, "destination_type")));
+ details::convertPrecision(GetStrAttr(dn, "destination_type")));
}
// LSTMCell layer
std::string val;
if (!getStrAttribute(node.child("data"), name, val)) return;
if (auto a = ngraph::as_type<ngraph::AttributeAdapter<ngraph::element::Type>>(&adapter)) {
- static_cast<ngraph::element::Type&>(*a) = details::ngraph::convertPrecision(val);
+ static_cast<ngraph::element::Type&>(*a) = details::convertPrecision(val);
} else if (auto a = ngraph::as_type<ngraph::AttributeAdapter<ngraph::PartialShape>>(&adapter)) {
std::vector<int64_t> shape;
std::vector<ngraph::Dimension> dims;
* All methods here do not throw exceptions and return a StatusCode and ResponseDesc object.
* Alternatively, to use methods that throw exceptions, refer to the CNNNetReader wrapper class.
*/
-#ifdef ENABLE_IR_READER
class INFERENCE_ENGINE_API_CLASS(IRReader) {
-#else
-class IRReader {
-#endif
public:
IRReader() = default;
explicit IRReader(const std::vector<IExtensionPtr>& exts): extensions(exts) {}
target_compile_definitions(${TARGET_NAME}_obj PRIVATE IMPLEMENT_INFERENCE_ENGINE_API)
target_include_directories(${TARGET_NAME}_obj PRIVATE ${PUBLIC_HEADERS_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src
+ ${IE_MAIN_SOURCE_DIR}/src/inference_engine # For CNNNetworkNGraphImpl
+ $<TARGET_PROPERTY:inference_engine_transformations,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:ngraph::ngraph,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:pugixml,INTERFACE_INCLUDE_DIRECTORIES>)
set_ie_threading_interface_for(${TARGET_NAME})
-target_link_libraries(${TARGET_NAME} PRIVATE ${NGRAPH_LIBRARIES} pugixml)
+target_link_libraries(${TARGET_NAME} PRIVATE ${NGRAPH_LIBRARIES} inference_engine_transformations pugixml)
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
# export targets
# install
install(TARGETS ${TARGET_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
precision = prec;
}
+ std::shared_ptr<::ngraph::Function> getFunction() noexcept override {
+ return nullptr;
+ }
+
std::shared_ptr<const ::ngraph::Function> getFunction() const noexcept override {
return nullptr;
}
#pragma once
-#include "cnn_network_ngraph_impl.hpp"
+#include "cnn_network_impl.hpp"
+#include <ngraph/attribute_visitor.hpp>
#include <memory>
+#include <string>
+#include <vector>
namespace InferenceEngine {
namespace details {
+
INFERENCE_ENGINE_API_CPP(std::shared_ptr<CNNNetworkImpl>)
-convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const CNNNetworkNGraphImpl &nGraphImpl);
+convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const ICNNNetwork &network);
+
} // namespace details
} // namespace InferenceEngine
*/
class INFERENCE_ENGINE_API_CLASS(ConstTransformer) {
public:
+ explicit ConstTransformer(ICNNNetwork* _network);
explicit ConstTransformer(details::CNNNetworkImpl* _network);
explicit ConstTransformer(std::vector<DataPtr> &_inputs, std::vector<DataPtr> &_outputs);
namespace InferenceEngine {
namespace details {
-namespace ngraph {
inline ::ngraph::element::Type convertPrecision(const Precision& precision) {
Precision::ePrecision pType = precision;
return ::ngraph::element::Type(::ngraph::element::Type_t::f32);
case Precision::FP16:
return ::ngraph::element::Type(::ngraph::element::Type_t::f16);
+ case Precision::BF16:
+ return ::ngraph::element::Type(::ngraph::element::Type_t::bf16);
case Precision::U8:
return ::ngraph::element::Type(::ngraph::element::Type_t::u8);
case Precision::I8:
return ::ngraph::element::Type(::ngraph::element::Type_t::f16);
} else if (precision == "f32" || precision == "FP32") {
return ::ngraph::element::Type(::ngraph::element::Type_t::f32);
+ } else if (precision == "bf16" || precision == "BF16") {
+ return ::ngraph::element::Type(::ngraph::element::Type_t::bf16);
} else if (precision == "f64" || precision == "FP64") {
return ::ngraph::element::Type(::ngraph::element::Type_t::f64);
} else if (precision == "i8" || precision == "I8") {
return Precision(Precision::FP16);
case ::ngraph::element::Type_t::f32:
return Precision(Precision::FP32);
+ case ::ngraph::element::Type_t::bf16:
+ return Precision(Precision::BF16);
case ::ngraph::element::Type_t::i8:
return Precision(Precision::I8);
case ::ngraph::element::Type_t::i16:
}
}
-} // namespace ngraph
} // namespace details
} // namespace InferenceEngine
#include <cpp/ie_cnn_network.h>
+#include <ie_icnn_network.hpp>
#include <cnn_network_impl.hpp>
#include <file_utils.h>
#include <deque>
IE_SUPPRESS_DEPRECATED_END
/**
+ * @brief Clones the whole network without conversion to CNNNetworkImpl. All layers and data objects will be cloned
+ * @note Blobs inside layers are reused
+ * @param network A network to clone
+ * @return A cloned object
+ */
+INFERENCE_ENGINE_API_CPP(std::shared_ptr<InferenceEngine::ICNNNetwork>)
+cloneNetwork(const InferenceEngine::ICNNNetwork& network);
+
+/**
* @brief Clones the whole network. All layers and data objects will be cloned
* @note Blobs inside layers are reused
* @param network A network to clone
ReshapeLayer*, TileLayer*, ScaleShiftLayer*, PReLULayer*, PowerLayer*, BatchNormalizationLayer*,
ClampLayer*, TensorIterator*, LSTMCell*, GRUCell*, RNNCell*, RNNSequenceLayer*, QuantizeLayer*,
BinaryConvolutionLayer*, WeightableLayer*, OneHotLayer*, MathLayer*, ReduceLayer*, UniqueLayer*,
- NonMaxSuppressionLayer*, ScatterLayer*, ExperimentalDetectronPriorGridGeneratorLayer*,
- ExperimentalDetectronGenerateProposalsSingleImageLayer*, CNNLayer*>;
+ NonMaxSuppressionLayer*, ScatterUpdateLayer*, ExperimentalDetectronPriorGridGeneratorLayer*,
+ ExperimentalDetectronGenerateProposalsSingleImageLayer*, ExperimentalDetectronTopKROIs*, CNNLayer*>;
template <class Visitor, std::size_t I = 0, typename... Tp>
inline typename std::enable_if<I == sizeof...(Tp), void>::type visitActualLayer(std::tuple<Tp...>&& t,
}
prev = *it;
}
- symQuant = *(intervals.begin());
+ if (!intervals.empty()) {
+ symQuant = *(intervals.begin());
+ }
std::set<double> divs;
prev = 0.f;
for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
#include <vector>
#include <unordered_set>
+#include <cnn_network_ngraph_impl.hpp>
#include "ngraph_ops/convolution_ie.hpp"
#include "ngraph_ops/deconvolution_ie.hpp"
#include "ngraph_ops/eltwise.hpp"
#include "ie_profiling.hpp"
#include "ie_cnn_layer_builder_ngraph.h"
+#include <debug.h>
#include "transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp"
#include "transformations/utils/utils.hpp"
namespace InferenceEngine {
namespace details {
-std::shared_ptr<CNNNetworkImpl> convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const CNNNetworkNGraphImpl &nGraphImpl) {
+
+/**
+ * @brief Creator for CNNLayer from nGraph op
+ */
+class CNNLayerCreator : public ::ngraph::AttributeVisitor {
+public:
+ using CreatorFor = std::function<CNNLayerPtr(const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> param)>;
+ explicit CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node);
+
+ CNNLayerPtr create();
+
+ void on_attribute(const std::string& name, std::string& value) override {
+ params[name] = value;
+ }
+
+ void on_attribute(const std::string& name, bool& value) override {
+ params[name] = value ? "true" : "false";
+ }
+
+ void addSpecificCreator(const std::vector<std::string>& forTypes, const CreatorFor& creator) {
+ for (const auto type : forTypes) {
+ creators[type] = creator;
+ }
+ }
+
+ void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::string>& adapter) override {
+ std::string data = adapter.get();
+ std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c) {
+ return std::tolower(c);
+ });
+ params[name] = data;
+ }
+
+ void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::vector<int64_t>>& adapter) override {
+ auto shape = adapter.get();
+ params[name] = joinVec(shape);
+ }
+
+ void on_adapter(const std::string& name, ::ngraph::ValueAccessor<double>& adapter) override {
+ params[name] = std::to_string(adapter.get());
+ }
+
+ void on_adapter(const std::string& name, ::ngraph::ValueAccessor<int64_t>& adapter) override {
+ params[name] = std::to_string(adapter.get());
+ }
+
+ void on_adapter(const std::string& name, ::ngraph::ValueAccessor<void>& adapter) override;
+
+private:
+ std::shared_ptr<::ngraph::Node> node;
+ std::map<std::string, std::string> params;
+ std::map<std::string, CreatorFor> creators;
+};
+
+void InferenceEngine::details::CNNLayerCreator::on_adapter(const std::string& name,
+ ::ngraph::ValueAccessor<void>& adapter) {
+ if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::element::Type>>(&adapter)) {
+ auto type = static_cast<::ngraph::element::Type&>(*a);
+ params[name] = details::convertPrecision(type).name();
+ } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::PartialShape>>(&adapter)) {
+ std::string dims;
+ auto shape = static_cast<::ngraph::PartialShape&>(*a);
+ for (size_t i = 0; i < shape.rank().get_length(); i++) {
+ if (!dims.empty()) dims += ",";
+ dims += std::to_string(shape[i].get_length());
+ }
+ params[name] = dims;
+ } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Shape>>(&adapter)) {
+ auto shape = static_cast<::ngraph::Shape&>(*a);
+ params[name] = joinVec(shape);
+ } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Strides>>(&adapter)) {
+ auto shape = static_cast<::ngraph::Strides&>(*a);
+ params[name] = joinVec(shape);
+ }
+}
+
+InferenceEngine::details::CNNLayerCreator::CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node): node(node) {
+ addSpecificCreator({"Parameter"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), "Input",
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<CNNLayer>(attrs);
+ return res;
+ });
+ // TODO - Remove "GreaterEq" once ngraph transitions to GreaterEqual
+ addSpecificCreator({"Eltwise", "Subtract", "Power", "Maximum", "Divide", "Greater", "GreaterEqual", "FloorMod", "LogicalOr", "LogicalAnd", "LogicalXor",
+ "GreaterEq", "Less", "LessEqual", "Equal", "NotEqual", "Multiply", "Add"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), "Eltwise",
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<EltwiseLayer>(attrs);
+ res->params = params;
+ if (node->description() == "Maximum") {
+ res->params["operation"] = "max";
+ } else if (node->description() == "Power") {
+ res->params["operation"] = "pow";
+ } else if (node->description() == "Subtract") {
+ res->params["operation"] = "sub";
+ } else if (node->description() == "Divide") {
+ res->params["operation"] = "div";
+ } else if (node->description() == "LessEqual") {
+ res->params["operation"] = "less_equal";
+ } else if (node->description() == "Less") {
+ res->params["operation"] = "less";
+ } else if (node->description() == "Equal") {
+ res->params["operation"] = "equal";
+ } else if (node->description() == "NotEqual") {
+ res->params["operation"] = "not_equal";
+ } else if (node->description() == "FloorMod") {
+ res->params["operation"] = "floor_mod";
+ } else if (node->description() == "Multiply") {
+ res->params["operation"] = "prod";
+ } else if (node->description() == "Add") {
+ res->params["operation"] = "sum";
+ } else if (node->description() == "Greater") {
+ res->params["operation"] = "greater";
+ } else if (node->description() == "GreaterEq") {
+ res->params["operation"] = "greater_equal";
+ } else if (node->description() == "GreaterEqual") {
+ res->params["operation"] = "greater_equal";
+ } else if (node->description() == "LogicalOr") {
+ res->params["operation"] = "logical_or";
+ } else if (node->description() == "LogicalAnd") {
+ res->params["operation"] = "logical_and";
+ } else if (node->description() == "LogicalXor") {
+ res->params["operation"] = "logical_xor";
+ } else if (node->description() == "Eltwise") {
+ auto castedLayer = std::dynamic_pointer_cast<::ngraph::op::Eltwise>(node);
+ if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << attrs.type << " layer " << attrs.name;
+ std::string type;
+ switch (castedLayer->eltwise_type) {
+ case ELTWISE_TYPE::Sum:
+ type = "sum";
+ break;
+ case ELTWISE_TYPE::Prod:
+ type = "prod";
+ break;
+ default:
+ THROW_IE_EXCEPTION << "Not supported eltwise type!";
+ }
+
+ res->params["operation"] = type;
+ }
+ return res;
+ });
+ addSpecificCreator({"Concat"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), node->description(),
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<ConcatLayer>(attrs);
+ res->params = params;
+ return res;
+ });
+ addSpecificCreator({"AvgPool", "MaxPool"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), "Pooling",
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<PoolingLayer>(attrs);
+ res->params = params;
+ if (res->params.find("auto_pad") != res->params.end() &&
+ details::CaselessEq<std::string>()(res->params["auto_pad"], "EXPLICIT"))
+ res->params.erase("auto_pad");
+
+ if (res->params.find("exclude_pad") != res->params.end()) {
+ res->params["exclude-pad"] = res->params["exclude_pad"];
+ res->params.erase("exclude_pad");
+ }
+
+ if (node->description() == "MaxPool") {
+ res->params["pool-method"] = "max";
+ } else if (node->description() == "AvgPool") {
+ res->params["pool-method"] = "avg";
+ }
+ return res;
+ });
+ addSpecificCreator({"Select"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), node->description(),
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<SelectLayer>(attrs);
+ res->params = params;
+ return res;
+ });
+ addSpecificCreator({"BinaryConvolution"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), node->description(),
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<BinaryConvolutionLayer>(attrs);
+
+ // todo: investigate difference between ngraph parameters for BinConvolution and the implementation above
+ // this leads to accuracy issue for Precollected_ONNX_ResNet50_88percentinto1bit e2e test
+ // res->params = params;
+
+ auto castedLayer = ::ngraph::as_type_ptr<::ngraph::op::v1::BinaryConvolution>(node);
+
+ std::string value;
+ for (const auto& val : castedLayer->get_pads_begin()) {
+ if (!value.empty()) value += ",";
+ value += Builder::asString(val);
+ }
+ res->params["pads_begin"] = value;
+
+ value.clear();
+ for (const auto& val : castedLayer->get_pads_end()) {
+ if (!value.empty()) value += ",";
+ value += Builder::asString(val);
+ }
+ res->params["pads_end"] = value;
+
+ switch (castedLayer->get_auto_pad()) {
+ case ::ngraph::op::PadType::SAME_UPPER:
+ res->params["auto_pad"] = "same_upper";
+ break;
+ case ::ngraph::op::PadType::SAME_LOWER:
+ res->params["auto_pad"] = "same_lower";
+ break;
+ case ::ngraph::op::PadType::VALID:
+ res->params["auto_pad"] = "valid";
+ break;
+ default:
+ break;
+ }
+
+ value.clear();
+ for (const auto& val : castedLayer->get_strides()) {
+ if (!value.empty()) value += ",";
+ value += Builder::asString(val);
+ }
+ res->params["strides"] = value;
+
+ value.clear();
+ for (const auto& val : castedLayer->get_dilations()) {
+ if (!value.empty()) value += ",";
+ value += Builder::asString(val);
+ }
+ res->params["dilations"] = value;
+
+ // Restore kernel size and output
+ const auto& shape = castedLayer->get_input_shape(1);
+ res->params["output"] = Builder::asString(shape[0]);
+
+ value.clear();
+ for (size_t i = 2; i < shape.size(); i++) {
+ if (!value.empty()) value += ",";
+ value += Builder::asString(shape[i]);
+ }
+ res->params["kernel"] = value;
+
+ switch (castedLayer->get_mode()) {
+ case ::ngraph::op::v1::BinaryConvolution::BinaryConvolutionMode::XNOR_POPCOUNT:
+ res->params["mode"] = "xnor-popcount";
+ }
+
+ auto weights_shape = castedLayer->input(1).get_source_output().get_shape();
+ res->params["input"] = Builder::asString(weights_shape[1]);
+ res->params["pad_value"] = Builder::asString(castedLayer->get_pad_value());
+
+ Builder::NodeConverter<::ngraph::op::Constant> converter;
+
+ const auto weightsNode = castedLayer->get_inputs()[1].get_output().get_node();
+ if (converter.canCreate(weightsNode)) {
+ const auto& weights = converter.createLayer(weightsNode);
+ res->blobs["weights"] = weights->blobs["custom"];
+ res->_weights = weights->blobs["custom"];
+ }
+ return res;
+ });
+
+ addSpecificCreator({"SpaceToBatch"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), node->description(),
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<SpaceToBatchLayer>(attrs);
+ res->params = params;
+ return res;
+ });
+
+ addSpecificCreator({"BatchToSpace"}, [](const std::shared_ptr<::ngraph::Node>& node,
+ const std::map<std::string, std::string> params) -> CNNLayerPtr {
+ LayerParams attrs = {node->get_friendly_name(), node->description(),
+ details::convertPrecision(node->get_output_element_type(0))};
+ auto res = std::make_shared<BatchToSpaceLayer>(attrs);
+ res->params = params;
+ return res;
+ });
+}
+
+CNNLayerPtr InferenceEngine::details::CNNLayerCreator::create() {
+ auto one_from = [](const std::string& desc, const std::vector<std::string>& descs) -> bool {
+ for (const auto& d : descs) {
+ if (details::CaselessEq<std::string>()(d, desc)) return true;
+ }
+ return false;
+ };
+ LayerParams attrs = {node->get_friendly_name(), node->description(),
+ details::convertPrecision(node->get_output_element_type(0))};
+ if (creators.find(node->description()) != creators.end())
+ return creators[node->description()](node, params);
+
+ auto res = std::make_shared<CNNLayer>(attrs);
+ res->params = params;
+ return res;
+}
+
+std::shared_ptr<CNNNetworkImpl> convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const ICNNNetwork &network) {
IE_PROFILING_AUTO_SCOPE(convertFunctionToICNNNetwork)
const auto createCNNLayer = [](const std::shared_ptr<::ngraph::Node> &node) -> CNNLayerPtr {
class NGraphCNNLayer: public CNNLayer {
network->setInputInfo(info);
};
+ const CNNNetworkNGraphImpl* nGraphImpl = dynamic_cast<const CNNNetworkNGraphImpl*>(&network);
+
InputsDataMap thisInputDataMap;
- nGraphImpl.getInputsInfo(thisInputDataMap);
+ network.getInputsInfo(thisInputDataMap);
// Create network
auto cnnNetworkImpl = std::make_shared<details::CNNNetworkImpl>();
for (const auto &dim : dims) {
if (!dim)
THROW_IE_EXCEPTION << cnnLayer->type << " layer " << cnnLayer->name
- << " has incorrect dimensions in the output data " << i;
+ << " has incorrect dimensions in the output data " << i;
}
-
- if (!ptr && nGraphImpl._data.find(outName) != nGraphImpl._data.end()) {
- ptr = nGraphImpl._data.at(outName);
+ if (!ptr && nGraphImpl && nGraphImpl->_data.find(outName) != nGraphImpl->_data.end()) {
+ ptr = nGraphImpl->_data.at(outName);
if (auto nData = std::dynamic_pointer_cast<InferenceEngine::details::NGraphData>(ptr)) {
const auto layout =
- dims.size() == nData->getTensorDesc().getDims().size() ?
- nData->getTensorDesc().getLayout() :
- TensorDesc::getLayoutByDims(dims);
+ dims.size() == nData->getTensorDesc().getDims().size() ?
+ nData->getTensorDesc().getLayout() :
+ TensorDesc::getLayoutByDims(dims);
nData->reset();
nData->reshape(dims, layout);
}
cnnNetworkImpl->addData(outName.c_str(), ptr);
}
+
if (!ptr) {
ptr.reset(new Data(outName,
- {details::ngraph::convertPrecision(layer->get_output_element_type(i)), dims,
+ {details::convertPrecision(layer->get_output_element_type(i)), dims,
TensorDesc::getLayoutByDims(dims)}));
}
#include <vector>
#include <mutex>
+#include <cnn_network_ngraph_impl.hpp>
#include "blob_factory.hpp"
#include "cnn_network_impl.hpp"
#include "graph_tools.hpp"
THROW_IE_EXCEPTION << "[ERROR]: Failed to init ConstTransformer with null pointer of network";
}
+ConstTransformer::ConstTransformer(ICNNNetwork* _network) {
+ if (auto cnnNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(_network)) {
+ network = cnnNet;
+ } else if (auto nGraphNet = dynamic_cast<InferenceEngine::details::CNNNetworkNGraphImpl *>(_network)) {
+ if (auto cnnNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(nGraphNet->getCNNNetwork().get()))
+ network = cnnNet;
+ }
+ if (!network)
+ THROW_IE_EXCEPTION << "[ERROR]: Failed to init ConstTransformer with unsupported network type";
+ inputs = get_inputs(network);
+ outputs = get_outputs(network);
+}
+
ConstTransformer::ConstTransformer(std::vector<DataPtr> &_inputs, std::vector<DataPtr> &_outputs)
: inputs(_inputs), outputs(_outputs), network(nullptr) {
if (inputs.empty() || outputs.empty())
//
#include <ie_cnn_layer_builder_ngraph.h>
-#include "cnn_network_ngraph_impl.hpp"
+#include <cnn_network_ngraph_impl.hpp>
#include <precision_utils.h>
#include <cpp/ie_cnn_network.h>
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Abs>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Abs",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get layer " << layer->get_friendly_name();
LayerParams params = {layer->get_friendly_name(), castedLayer->getType(),
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
if (castedLayer->getType() == "RNNCell")
res = std::make_shared<InferenceEngine::RNNCell>(params);
// Create Inference Engine representation of TensorIterator
LayerParams params = {layer->get_friendly_name(), "TensorIterator",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::TensorIterator>(params);
// Body: inputs
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Constant>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Const",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Constant>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Convert>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Convert",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
- auto p = details::ngraph::convertPrecision(layer->get_output_element_type(0));
+ auto p = details::convertPrecision(layer->get_output_element_type(0));
std::string precision_str;
switch (p) {
case Precision::FP16:
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Ceiling>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Ceiling",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Floor>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Floor",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Sigmoid>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Sigmoid",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Tanh>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "TanH",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Relu>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReLU",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReLULayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::SeluIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Selu",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::SeluIE>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ReLUIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReLU",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReLULayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ReLUIE>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Range>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Range",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Exp>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Exp",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::MVN>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "MVN",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::MVNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::MVN>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::LRN_IE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Norm",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::NormLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::LRN_IE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::CropIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Crop",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CropLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::CropIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Clamp>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Clamp",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ClampLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Clamp>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Softmax>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "SoftMax",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::SoftMaxLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::Softmax>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Subtract>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "sub";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Power>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "pow";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Maximum>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "max";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Divide>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "div";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Multiply>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "prod";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Add>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "sum";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Squeeze>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Squeeze",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Squeeze>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Unsqueeze>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Unsqueeze",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Unsqueeze>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::FakeQuantize>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "FakeQuantize",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::QuantizeLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::FakeQuantize>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
CNNLayer::Ptr NodeConverter<ngraph::op::ConvolutionIE>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Convolution",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ConvolutionLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ConvolutionIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
CNNLayer::Ptr NodeConverter<ngraph::op::DeconvolutionIE>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Deconvolution",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::DeconvolutionLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::DeconvolutionIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
CNNLayer::Ptr NodeConverter<ngraph::op::v1::DeformableConvolution>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "DeformableConvolution",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::DeformableConvolutionLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::DeformableConvolution>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::AvgPool>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Pooling",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::PoolingLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::AvgPool>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::MaxPool>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Pooling",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::PoolingLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::MaxPool>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ROIPooling>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ROIPooling",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ROIPooling>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::PSROIPooling>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "PSROIPooling",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::PSROIPooling>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
CNNLayer::Ptr NodeConverter<ngraph::op::v1::DeformablePSROIPooling>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "PSROIPooling",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::DeformablePSROIPooling>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::PRelu>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "PReLU",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::PReLULayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::PRelu>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Split>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Split",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::SplitLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::Split>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::VariadicSplit>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Split",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::SplitLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::VariadicSplit>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Concat>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Concat",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ConcatLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Concat>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::GatherIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Gather",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::GatherLayer>(params);
auto castedLayer = std::dynamic_pointer_cast<ngraph::op::GatherIE>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::GatherTreeIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "GatherTree",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ReverseSequence>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
- LayerParams params = {layer->get_friendly_name(), "ReverseSequence", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ LayerParams params = {layer->get_friendly_name(), "ReverseSequence", details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReverseSequenceLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ReverseSequence>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Reshape>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Reshape",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReshapeLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ShapeOf>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ShapeOf",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::Reshape>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Reshape",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::Reshape>(layer);
if (castedLayer == nullptr)
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::PadIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Pad",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::PadLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::PadIE>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ScaleShiftIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ScaleShift",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ScaleShiftLayer>(params);
NodeConverter<ngraph::op::Constant> converter;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Elu>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "elu",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Elu>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::SquaredDifference>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
res->params["operation"] = "squared_diff";
return res;
CNNLayer::Ptr NodeConverter<ngraph::op::DetectionOutput>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "DetectionOutput",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::DetectionOutput>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Transpose>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Permute",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
NodeConverter<ngraph::op::Constant> converter;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ProposalIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Proposal",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ProposalIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
CNNLayer::Ptr NodeConverter<ngraph::op::PriorBoxClusteredIE>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "PriorBoxClustered",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::PriorBoxClusteredIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::PriorBoxIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "PriorBox",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::PriorBoxIE>(layer);
auto layer_info = params.type + " layer " + params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::PowerIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Power",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::PowerLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::PowerIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::TopK>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "TopK",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::TopKLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::TopK>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::TopKIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "TopK",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::TopKLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::TopKIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Eltwise>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Eltwise",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Eltwise>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::TileIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Tile",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::TileLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::TileIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ResampleV2>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
- LayerParams params = {layer->get_friendly_name(), "Resample", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ LayerParams params = {layer->get_friendly_name(), "Resample", details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ResampleV2>(layer);
if (castedLayer == nullptr)
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Interp>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Resample",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto castedLayer = ngraph::as_type_ptr<ngraph::op::Interp>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
}
params = {layer->get_friendly_name(), "Interp",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
res->params["height"] = asString(attrs.height);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::FullyConnected>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "FullyConnected",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto castedLayer = ngraph::as_type_ptr<ngraph::op::FullyConnected>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::LSTMCellIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "LSTMCell",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto castedLayer = ngraph::as_type_ptr<ngraph::op::LSTMCellIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::MatMul>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Gemm",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto castedLayer = ngraph::as_type_ptr<ngraph::op::MatMul>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::RegionYolo>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "RegionYolo",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::RegionYolo>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::ReorgYolo>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReorgYolo",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::ReorgYolo>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMin>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReduceMin",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceMin>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMax>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReduceMax",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceMax>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMean>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReduceMean",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceMean>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceProd>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReduceProd",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceProd>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceSum>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "ReduceSum",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceSum>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Log>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Log",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::NormalizeIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Normalize",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::NormLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::NormalizeIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
if (converter.canCreate(weightsNode)) {
const auto& weights = converter.createLayer(weightsNode);
res->blobs["weights"] = weights->blobs["custom"];
+ } else {
+ THROW_IE_EXCEPTION << "Cannot convert weight node for NormalizeIE op";
}
return res;
CNNLayer::Ptr NodeConverter<ngraph::op::CTCGreedyDecoder>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "CTCGreedyDecoder",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = ngraph::as_type_ptr<ngraph::op::CTCGreedyDecoder>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Erf>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Erf",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Sign>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Sign",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Sin>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Sin",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Sinh>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Sinh",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Asin>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Asin",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Cos>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Cos",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Cosh>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Cosh",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Acos>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Acos",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Tan>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Tan",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Atan>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Atan",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::Sqrt>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "Sqrt",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
return res;
}
CNNLayer::Ptr NodeConverter<ngraph::op::StridedSliceIE>::createLayer(
const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "StridedSlice",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::StridedSliceLayer>(params);
auto castedLayer = std::dynamic_pointer_cast<ngraph::op::StridedSliceIE>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::HardSigmoid_IE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
- LayerParams params = { layer->get_friendly_name(), "HardSigmoid", details::ngraph::convertPrecision(layer->get_output_element_type(0)) };
+ LayerParams params = { layer->get_friendly_name(), "HardSigmoid", details::convertPrecision(layer->get_output_element_type(0)) };
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
auto castedLayer = std::dynamic_pointer_cast<ngraph::op::HardSigmoid_IE>(layer);
if (castedLayer == nullptr)
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::GRN>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
LayerParams params = {layer->get_friendly_name(), "GRN",
- details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ details::convertPrecision(layer->get_output_element_type(0))};
auto castedLayer = std::dynamic_pointer_cast<ngraph::op::GRN>(layer);
if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::LogicalNot>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
- LayerParams params = {layer->get_friendly_name(), "Activation", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ LayerParams params = {layer->get_friendly_name(), "Activation", details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
res->params["type"] = "not";
return res;
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceLogicalAnd>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
- LayerParams params = {layer->get_friendly_name(), "ReduceAnd", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ LayerParams params = {layer->get_friendly_name(), "ReduceAnd", details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = std::dynamic_pointer_cast<ngraph::op::v1::ReduceLogicalAnd>(layer);
template <>
CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceLogicalOr>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
- LayerParams params = {layer->get_friendly_name(), "ReduceOr", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+ LayerParams params = {layer->get_friendly_name(), "ReduceOr", details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
auto castedLayer = std::dynamic_pointer_cast<ngraph::op::v1::ReduceLogicalOr>(layer);
Blob::Ptr shareWeights(const std::shared_ptr<ngraph::op::Constant>& constLayer) const {
if (!constLayer) THROW_IE_EXCEPTION << "Cannot share weights! Constant operation is empty!";
- auto dataPrecision = details::ngraph::convertPrecision(constLayer->get_element_type());
+ auto dataPrecision = details::convertPrecision(constLayer->get_element_type());
size_t shapeSize = ngraph::shape_size(constLayer->get_shape());
if (dataPrecision == Precision::BIN) {
PriorBoxClusteredValidator::PriorBoxClusteredValidator(const std::string& _type): LayerValidator(_type) {}
void ProposalValidator::parseParams(CNNLayer* layer) {
- layer->params["num_outputs"] = std::to_string(layer->outData.size());
+ if (layer->params.find("num_outputs") == layer->params.end()) {
+ layer->params["num_outputs"] = std::to_string(layer->outData.size());
+ }
}
void ProposalValidator::checkParams(const CNNLayer* layer) {
THROW_IE_EXCEPTION << layer->name << " 'score_threshold' should be scalar";
}
-ScatterValidator::ScatterValidator(const std::string& _type): LayerValidator(_type) {}
+ScatterUpdateValidator::ScatterUpdateValidator(const std::string& _type): LayerValidator(_type) {}
-void ScatterValidator::parseParams(CNNLayer* layer) {
- auto casted = dynamic_cast<ScatterLayer*>(layer);
+void ScatterUpdateValidator::parseParams(CNNLayer* layer) {
+ auto casted = dynamic_cast<ScatterUpdateLayer*>(layer);
if (!casted) {
- THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterLayer class";
+ THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterUpdateLayer class";
}
-
- casted->axis = casted->GetParamAsInt("axis", 0);
}
-void ScatterValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
- auto casted = dynamic_cast<const ScatterLayer*>(layer);
+void ScatterUpdateValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+ auto casted = dynamic_cast<const ScatterUpdateLayer*>(layer);
if (!casted) {
- THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterLayer class";
+ THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterUpdateLayer class";
}
size_t numInputs = inShapes.size();
- if (numInputs != 3)
- THROW_IE_EXCEPTION << layer->name << " Scatter can take only 3 inputs, but actually it has: " << numInputs;
+ if (numInputs != 4)
+ THROW_IE_EXCEPTION << layer->name << " Scatter can take only 4 inputs, but actually it has: " << numInputs;
- if (!(-static_cast<int>(inShapes[0].size()) <= casted->axis && casted->axis < static_cast<int>(inShapes[0].size())))
- THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimensions and axis number!";
+ static constexpr int DATA = 0;
+ static constexpr int INDICES = 1;
+ static constexpr int UPDATES = 2;
+ static constexpr int AXIS = 3;
- if (inShapes[0].size() == 0 || (inShapes[0].size() == 1 && inShapes[0][0] == 1))
- THROW_IE_EXCEPTION << layer->name << " 'Data' tensor rank should be >= 1";
+ if (inShapes[DATA].size() < 1)
+ THROW_IE_EXCEPTION << layer->name << " 'Data' tensor rank must be >= 1";
- if (inShapes[1].size() == 0 || (inShapes[1].size() == 1 && inShapes[1][0] == 1))
- THROW_IE_EXCEPTION << layer->name << " 'Indexes' tensor rank should be >= 1";
+ if (inShapes[INDICES].size() < 1)
+ THROW_IE_EXCEPTION << layer->name << " 'Indices' tensor rank must be >= 1";
- if (inShapes[1].size() == 0 || (inShapes[1].size() == 1 && inShapes[1][0] == 1))
- THROW_IE_EXCEPTION << layer->name << " 'Updates' tensor rank should be >= 1";
+ if (inShapes[UPDATES].size() < 1)
+ THROW_IE_EXCEPTION << layer->name << " 'Updates' tensor rank must be >= 1";
- if (inShapes[1] != inShapes[2])
- THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'indexes' and 'updates' tensors dimension";
+ if (!(inShapes[AXIS].size() == 1 && inShapes[AXIS][0] == 1))
+ THROW_IE_EXCEPTION << layer->name << " 'Axis' tensor must be 1D array of 1 element";
- const size_t SCATTER_DATA = 0;
- const size_t SCATTER_INDEXES = 1;
- const size_t SCATTER_UPDATES = 2;
+ if (inShapes[UPDATES].size() != inShapes[INDICES].size() + inShapes[DATA].size() - 1)
+ THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'indexes' and 'updates' tensors dimension";
- Precision inIdxPrecision = layer->insData[SCATTER_INDEXES].lock()->getTensorDesc().getPrecision();
+ Precision inIdxPrecision = layer->insData[INDICES].lock()->getTensorDesc().getPrecision();
if (inIdxPrecision != Precision::FP32 && inIdxPrecision != Precision::I32)
- THROW_IE_EXCEPTION << layer->name << " Incorrect input 'Indexes' precision. Only FP32 or I32 are supported!";
+ THROW_IE_EXCEPTION << layer->name << " Incorrect input 'Indices' precision. Only FP32 or I32 are supported!";
+
+ Precision inAxisPrecision = layer->insData[AXIS].lock()->getTensorDesc().getPrecision();
+ if (inAxisPrecision != Precision::FP32 && inAxisPrecision != Precision::I32)
+ THROW_IE_EXCEPTION << layer->name << " Incorrect input 'Axis' precision. Only FP32 or I32 are supported!";
- if (layer->insData[SCATTER_DATA].lock()->getTensorDesc().getPrecision() !=
- layer->insData[SCATTER_UPDATES].lock()->getTensorDesc().getPrecision())
+ if (layer->insData[DATA].lock()->getTensorDesc().getPrecision() !=
+ layer->insData[UPDATES].lock()->getTensorDesc().getPrecision())
THROW_IE_EXCEPTION << layer->name << " Precision should be equal for input tensors 'Data' and 'Updates'";
}
REG_LAYER_VALIDATOR_FOR_TYPE(TopKValidator, TopK);
REG_LAYER_VALIDATOR_FOR_TYPE(UniqueValidator, Unique);
REG_LAYER_VALIDATOR_FOR_TYPE(NMSValidator, NonMaxSuppression);
- REG_LAYER_VALIDATOR_FOR_TYPE(ScatterValidator, ScatterUpdate);
+ REG_LAYER_VALIDATOR_FOR_TYPE(ScatterUpdateValidator, ScatterUpdate);
}
} // namespace InferenceEngine
void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
};
-class ScatterValidator : public LayerValidator {
+class ScatterUpdateValidator : public LayerValidator {
public:
- explicit ScatterValidator(const std::string& _type);
+ explicit ScatterUpdateValidator(const std::string& _type);
void parseParams(CNNLayer* layer) override;
TopKLayer::~TopKLayer() {}
UniqueLayer::~UniqueLayer() {}
NonMaxSuppressionLayer::~NonMaxSuppressionLayer() {}
-ScatterLayer::~ScatterLayer() {}
+ScatterUpdateLayer::~ScatterUpdateLayer() {}
ExperimentalDetectronPriorGridGeneratorLayer::~ExperimentalDetectronPriorGridGeneratorLayer() {}
ExperimentalDetectronGenerateProposalsSingleImageLayer::~ExperimentalDetectronGenerateProposalsSingleImageLayer() {}
+ExperimentalDetectronTopKROIs::~ExperimentalDetectronTopKROIs() {}
CNNLayerPtr clonelayer(const CNNLayer& source) {
using fptr = CNNLayerPtr (*)(const CNNLayer*);
// Most derived layers must go first in this list
- static const fptr cloners[] = {&layerCloneImpl<ExperimentalDetectronGenerateProposalsSingleImageLayer>,
+ static const fptr cloners[] = {&layerCloneImpl<ExperimentalDetectronTopKROIs>,
+ &layerCloneImpl<ExperimentalDetectronGenerateProposalsSingleImageLayer>,
&layerCloneImpl<ExperimentalDetectronPriorGridGeneratorLayer>,
- &layerCloneImpl<ScatterLayer>,
+ &layerCloneImpl<ScatterUpdateLayer>,
&layerCloneImpl<NonMaxSuppressionLayer>,
&layerCloneImpl<SelectLayer>,
&layerCloneImpl<BatchNormalizationLayer>,
return nullptr; // Silence "control may reach end of non-void function" warning
}
+std::shared_ptr<ICNNNetwork> cloneNetwork(const ICNNNetwork& network) {
+ if (auto func = network.getFunction()) {
+ CNNNetwork net(func);
+
+ InputsDataMap originInputs;
+ OutputsDataMap originOutputs;
+ network.getInputsInfo(originInputs);
+ network.getOutputsInfo(originOutputs);
+ InputsDataMap clonedInputs = net.getInputsInfo();
+ OutputsDataMap clonedOutputs = net.getOutputsInfo();
+
+ for (const auto& outputInfo : originOutputs) {
+ if (clonedOutputs.find(outputInfo.first) == clonedOutputs.end())
+ THROW_IE_EXCEPTION << "Cannot clone network! Cloned network doesn't contain all outputs";
+ clonedOutputs[outputInfo.first]->setPrecision(outputInfo.second->getPrecision());
+ clonedOutputs[outputInfo.first]->setLayout(outputInfo.second->getLayout());
+ }
+ for (const auto& inputInfo : originInputs) {
+ if (clonedInputs.find(inputInfo.first) == clonedInputs.end())
+ THROW_IE_EXCEPTION << "Cannot clone network! Cloned network doesn't contain all inputs";
+ clonedInputs[inputInfo.first]->setPrecision(inputInfo.second->getPrecision());
+ clonedInputs[inputInfo.first]->setLayout(inputInfo.second->getLayout());
+ clonedInputs[inputInfo.first]->getPreProcess() = inputInfo.second->getPreProcess();
+ }
+ return net;
+ }
+
+ return cloneNet(network);
+}
details::CNNNetworkImplPtr cloneNet(const ICNNNetwork& network) {
std::vector<CNNLayerPtr> layers;
details::CNNNetworkIterator i(&network);
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
if (dumpWeights && !node->blobs.empty()) {
auto blobsNode = layer.append_child("blobs");
for (const auto& dataIt : node->blobs) {
+ if (!dataIt.second) continue;
size_t dataSize = dataIt.second->byteSize();
pugi::xml_node data = blobsNode.append_child(dataIt.first.c_str());
data.append_attribute("offset").set_value(dataOffset);
for (auto&& node : ordered) {
if (!node->blobs.empty()) {
for (const auto& dataIt : node->blobs) {
+ if (!dataIt.second) continue;
const char* dataPtr = dataIt.second->buffer().as<char*>();
size_t dataSize = dataIt.second->byteSize();
stream.write(dataPtr, dataSize);
if (!stream.good()) {
- THROW_IE_EXCEPTION << "Error during writing blob waights";
+ THROW_IE_EXCEPTION << "Error during writing blob weights";
}
}
}
REG_SHAPE_INFER_FOR_TYPE(TopKShapeProp, TopK);
REG_SHAPE_INFER_FOR_TYPE(UniqueShapeProp, Unique);
REG_SHAPE_INFER_FOR_TYPE(NMSShapeProp, NonMaxSuppression);
-REG_SHAPE_INFER_FOR_TYPE(ScatterShapeProp, Scatter);
+REG_SHAPE_INFER_FOR_TYPE(ScatterUpdateShapeProp, ScatterUpdate);
} // namespace ShapeInfer
} // namespace InferenceEngine
namespace ShapeInfer {
/**
- *@brief Implementation of Shape inference for Scatter layer
+ *@brief Implementation of Shape inference for ScatterUpdate layer
*/
-class ScatterShapeProp : public BuiltInShapeInferImpl {
+class ScatterUpdateShapeProp : public BuiltInShapeInferImpl {
public:
- explicit ScatterShapeProp(const std::string& type): BuiltInShapeInferImpl(type) {}
+ explicit ScatterUpdateShapeProp(const std::string& type): BuiltInShapeInferImpl(type) {}
void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs, const std::map<std::string, std::string>& params,
const std::map<std::string, Blob::Ptr>& blobs, std::vector<SizeVector>& outShapes) override {
LayerParams lp {};
- ScatterLayer scatterLayer(lp);
- scatterLayer.params = params;
- scatterLayer.type = _type;
- validate(&scatterLayer, inBlobs, params, blobs);
+ ScatterUpdateLayer scatterUpdateLayer(lp);
+ scatterUpdateLayer.params = params;
+ scatterUpdateLayer.type = _type;
+ validate(&scatterUpdateLayer, inBlobs, params, blobs);
outShapes = {inShapes[0]};
}
$<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
# developer package
# install
install(TARGETS ${TARGET_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bf16transformer.h"
+#include <string>
+#include <vector>
+#include <fstream>
+#include <utility>
+#include <set>
+#include <chrono>
+#include "details/ie_cnn_network_tools.h"
+#include "ie_util_internal.hpp"
+#include "ngraph/type/bfloat16.hpp"
+
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+using namespace InferenceEngine::details;
+
+void precisionColoringBF16(const CNNLayerPtr layer,
+ ordered_properties &printed_properties,
+ ordered_properties &node_properties) {
+ if (layer && !layer->insData.empty() && layer->input()) {
+ printed_properties.insert(printed_properties.begin(),
+ std::pair<std::string, std::string>("Precision",
+ layer->input()->getPrecision() == Precision::FP32 ? "FP32" : "BF16"));
+
+ if (layer->input()->getPrecision() == Precision::FP32) {
+ node_properties.emplace_back("fillcolor", "#5A5DF0");
+ } else {
+ node_properties.emplace_back("fillcolor", "#20F608");
+ }
+ }
+}
+
+void BF16Transformer::convertToFloat(InferenceEngine::CNNNetwork &network) {
+ // go over all edges and all edges having FP32 mark as BF16
+ std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+ InputsDataMap inputs = network.getInputsInfo();
+ OutputsDataMap outputs = network.getOutputsInfo();
+ for (auto iter : sortedLayers) {
+ for (size_t o = 0; o < iter->outData.size(); o++) {
+ if (inputs.find(iter->outData[o]->getName()) == inputs.end()
+ && outputs.find(iter->outData[o]->getName()) == outputs.end()
+ && iter->outData[o]->getPrecision() == Precision::BF16) {
+ iter->outData[o]->setPrecision(Precision::FP32);
+ }
+ }
+ }
+}
+
+void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
+ // go over all edges and all edges having FP32 mark as BF16
+ std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+ InputsDataMap inputs = network.getInputsInfo();
+ OutputsDataMap outputs = network.getOutputsInfo();
+ for (auto iter : sortedLayers) {
+ for (size_t o = 0; o < iter->outData.size(); o++) {
+ if (inputs.find(iter->outData[o]->getName()) == inputs.end()
+ && outputs.find(iter->outData[o]->getName()) == outputs.end()
+ && iter->outData[o]->getPrecision() == Precision::FP32) {
+ iter->outData[o]->setPrecision(Precision::BF16);
+ }
+ }
+ }
+
+ // convert all edges back to FP32 on demand
+ optimizeToFloat(network);
+}
+
+void BF16Transformer::optimizeToFloat(InferenceEngine::CNNNetwork &network) {
+ std::set<DataPtr> toAnalyzeTensors;
+ std::set<DataPtr> immutable;
+ bool hasBF16Tensor = false;
+ std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+ // 1. Verify if we do not have bf16 tensors - it's better to return early and not to try to return anything since there is no such tensors
+ for (auto iter : sortedLayers) {
+ for (size_t i = 0; i < iter->insData.size(); i++) {
+ if (iter->insData[i].lock()->getTensorDesc().getPrecision() == Precision::BF16) {
+ hasBF16Tensor = true;
+ }
+ }
+ for (size_t o = 0; o < iter->outData.size(); o++) {
+ if (iter->outData[o]->getTensorDesc().getPrecision() == Precision::BF16) {
+ hasBF16Tensor = true;
+ }
+ }
+ }
+ if (!hasBF16Tensor) {
+ return;
+ }
+ // 2a. go over all inputs and outputs and put them to the toAnalyzeTensors
+ InputsDataMap inputs = network.getInputsInfo();
+ for (auto input : inputs) {
+ immutable.insert(input.second->getInputData());
+ if (input.second->getInputData()->getTensorDesc().getPrecision() != Precision::BF16) {
+ toAnalyzeTensors.insert(input.second->getInputData());
+ }
+ }
+
+ OutputsDataMap outputs = network.getOutputsInfo();
+ for (auto output : outputs) {
+ immutable.insert(output.second);
+ if (output.second->getTensorDesc().getPrecision() != Precision::BF16) {
+ toAnalyzeTensors.insert(output.second);
+ }
+ }
+
+ // 2b. go over all unknown layers for this algo and mark them as fp32 and add to the toAnalyzeTensors
+ // 2c. go over all inputs to _initbf16 and if they are fp32 - add them to the toAnalyzeTensors
+ for (auto iter : sortedLayers) {
+ if (_initbf16.find(iter->type) == _initbf16.end()
+ && _complementbf16.find(iter->type) == _complementbf16.end()
+ && _multiinput.find(iter->type) == _multiinput.end()) {
+ // try to mark inputs of the unknown layer
+ for (size_t i = 0; i < iter->insData.size(); i++) {
+ if (iter->insData[i].lock()->getPrecision() == Precision::BF16) {
+ bool marked = tryToMarkFP32(iter->insData[i].lock(), immutable);
+ if (marked) {
+ toAnalyzeTensors.insert(iter->insData[i].lock());
+ }
+ }
+ }
+ // try to mark outputs of the unknown layer
+ for (size_t o = 0; o < iter->outData.size(); o++) {
+ if (iter->outData[o]->getPrecision() == Precision::BF16) {
+ bool marked = tryToMarkFP32(iter->outData[o], immutable);
+ if (marked) {
+ toAnalyzeTensors.insert(iter->outData[o]);
+ }
+ }
+ }
+ }
+ if (_initbf16.find(iter->type) != _initbf16.end()) {
+ // verify if input activation tensor is not bf16 - add to toAnalyzeTensors as well
+ // we are assuming here that _initbf16 contain only layers having one dynamic input
+ // in other case algorithm should be changed to care about two dynamic input tensors
+ // and take into account case of different precision if they are
+ if (iter->insData[0].lock()->getTensorDesc().getPrecision() != Precision::BF16) {
+ toAnalyzeTensors.insert(iter->insData[0].lock());
+ // output tensor for FP32 convolutoin/FC layers should be FP32 as well
+ for (size_t o = 0; o < iter->outData.size(); o++) {
+ if (iter->outData[o]->getPrecision() == Precision::BF16) {
+ bool marked = tryToMarkFP32(iter->outData[o], immutable);
+ if (marked) {
+ toAnalyzeTensors.insert(iter->outData[o]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // 3 - while toAnalyzeTensors is not empty look at the layers dealing with tensors mentioned in toAnalyzeTensors
+ while (!toAnalyzeTensors.empty()) {
+ DataPtr tensor = *toAnalyzeTensors.begin();
+ toAnalyzeTensors.erase(tensor);
+ // look into producer of the tensor
+ auto layer = tensor->getCreatorLayer().lock();
+ // if this layer is not from _initbf16 - analyze inputs
+ if (_initbf16.find(layer->type) == _initbf16.end()) {
+ // for all inputs investigate and modify tensor precision if required
+ for (size_t i = 0; i < layer->insData.size(); i++) {
+ bool marked = tryToMarkFP32(layer->insData[i].lock(), immutable);
+ if (marked) {
+ toAnalyzeTensors.insert(layer->insData[i].lock());
+ }
+ }
+ }
+
+ // mark all produced tensors to FP32 if they are BF16 and if they do not go _only_ to the toAnalyzeTensors
+ // TODO: when we enable greedy mode and start to produce bf16 tensor even if one consumer accepts it,
+ // this place should be changed.
+ // Instead of "if they do not go _only_ to the toAnalyzeTensors" we have to apply "if they do not go at least to one of _initbf16"
+ // TODO: add test input1->pooling1->conv1 and the same pooling1->relu. for example. now convolution should be returned to fp32
+ // after greedy mode, it should be fp32.
+ for (auto inputTo : tensor->getInputTo()) {
+ for (size_t o = 0; o < inputTo.second->outData.size(); o++) {
+ if (inputTo.second->outData[o]->getTensorDesc().getPrecision() == Precision::BF16) {
+ bool marked = tryToMarkFP32(inputTo.second->outData[o], immutable);
+ if (marked) {
+ toAnalyzeTensors.insert(layer->outData[o]);
+ }
+ }
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ {
+ std::ofstream file("bf16_icnnnetwork.dot");
+ saveGraphToDot(network, file, precisionColoringBF16);
+ }
+#endif
+}
+
+bool BF16Transformer::tryToMarkFP32(InferenceEngine::DataPtr data, const std::set<InferenceEngine::DataPtr>& immutable) {
+ bool marked = false;
+ if (immutable.find(data) == immutable.end() && data->getPrecision() == Precision::BF16) {
+ // we treat one consumer and many in different ways
+ // if there is one consumer, we can mark its input as float if it does not belong to the list of initial layers
+ // in other cases we need to mark tensor which is passed to several l ayers as FP32 only if there is at least one conusmer
+ // produces data in FP32. I.e. there should be a way fo getting FP32 from output data to this point
+ if (data->getInputTo().size() == 1) {
+ if (_initbf16.find(data->getInputTo().begin()->second->type) == _initbf16.end()) {
+ marked = true;
+ }
+ } else {
+ // get all consumers
+ for (auto o : data->getInputTo()) {
+ // if tensor goes to several layers, we will mark it by FP32 only if one of the layer is unknown
+ if (_initbf16.find(o.second->type) == _initbf16.end() &&
+ _complementbf16.find(o.second->type) == _complementbf16.end() &&
+ _multiinput.find(o.second->type) == _multiinput.end()) {
+ marked = true;
+ }
+ }
+ }
+ if (marked) {
+ data->setPrecision(Precision::FP32);
+ }
+ }
+ return marked;
+}
+
+InferenceEngine::MemoryBlob::Ptr BF16Transformer::convertBF16ToFloat(InferenceEngine::MemoryBlob::Ptr tweights) {
+ TensorDesc td(Precision::FP32, tweights->getTensorDesc().getDims(), tweights->getTensorDesc().getLayout());
+ MemoryBlob::Ptr weightsFP32 = make_shared_blob<float>(td);
+ weightsFP32->allocate();
+ auto lmbf16 = tweights->rmap();
+ short *bf16data = lmbf16.as<short *>();
+ auto lmfp32 = weightsFP32->wmap();
+ float *fp32data = lmfp32.as<float *>();
+ for (size_t i = 0; i < weightsFP32->size(); i++) {
+ fp32data[i] = ngraph::bfloat16::from_bits(bf16data[i]);
+ }
+ return weightsFP32;
+}
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <details/caseless.hpp>
+#include <string>
+#include <set>
+#include "inference_engine.hpp"
+
+namespace MKLDNNPlugin {
+
+class BF16Transformer {
+ const InferenceEngine::details::caseless_set<std::string> _initbf16 =
+ { "convolution", "fullyconnected", "innerproduct" };
+ const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
+ { "relu", "pooling", "norm", "gather" };
+ const InferenceEngine::details::caseless_set<std::string> _multiinput =
+ { "concat", "eltwise" };
+
+ /**
+ * Tries to mark tensor as FP32 by analyzing of local consumers of the tensor. Do not mark if
+ *
+ * 1. tensor goes to init layer (conv of fc)
+ * 2. goes to the layers which can work with BF16
+ *
+ * if tensor goes to layer not supporting BF16, this tensor will be marked as FP32
+ */
+ bool tryToMarkFP32(InferenceEngine::DataPtr data, const std::set<InferenceEngine::DataPtr> &immutable);
+
+public:
+ /**
+ * Restores Float point data types on edges which goes to non supported layers
+ *
+ * Algo:
+ * 1. Verify if we do not have bf16 tensors it's better to return early and not to try to return
+ * anything since there is no such tensors
+ * 2a. go over all inputs and outputs and if data type is not BF16, put them to the toAnalyzeTensors
+ * 2b. go over all unknown layers for this algo and mark them as fp32 and add their inputs and
+ * outputs to the toAnalyzeTensors and try to mark them as FP32
+ * 2c. go over all inputs to _initbf16 and if they are fp32 add them to the toAnalyzeTensors
+ *
+ * 3 - while toAnalyzeTensors is not empty look at the layers dealing with tensors mentioned in
+ * toAnalyzeTensors, analyze parent and children and depending on the type of the layers try to
+ * extend FP32 data type
+ */
+ void optimizeToFloat(InferenceEngine::CNNNetwork &network);
+
+ /**
+ * Converts all edges from bfloat16 to float data type. Do not touch input and output nodes
+ */
+ void convertToFloat(InferenceEngine::CNNNetwork &network);
+
+ /**
+ * converts all fp32 edges excepting inputs and outputs to bf16 and call restoreFloatPrecision
+ */
+ void convertToBFloat16(InferenceEngine::CNNNetwork &network);
+
+ InferenceEngine::MemoryBlob::Ptr convertBF16ToFloat(InferenceEngine::MemoryBlob::Ptr);
+};
+
+} // namespace MKLDNNPlugin
dumpQuantizedGraphToDot = val;
} else if (key.compare(PluginConfigParams::KEY_DUMP_QUANTIZED_GRAPH_AS_IR) == 0) {
dumpQuantizedGraphToIr = val;
+ } else if (key == PluginConfigParams::KEY_ENFORCE_BF16) {
+ if (val == PluginConfigParams::YES) enforceBF16 = true;
+ else if (val == PluginConfigParams::NO) enforceBF16 = false;
+ else
+ THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_ENFORCE_BF16
+ << ". Expected only YES/NO";
} else {
THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property " << key << " by CPU plugin";
}
_config.insert({ PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, std::to_string(streamExecutorConfig._streams) });
_config.insert({ PluginConfigParams::KEY_CPU_THREADS_NUM, std::to_string(streamExecutorConfig._threads) });
_config.insert({ PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT, dumpToDot });
+ if (enforceBF16)
+ _config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES });
+ else
+ _config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO });
}
}
std::string dumpQuantizedGraphToDot = "";
std::string dumpQuantizedGraphToIr = "";
int batchLimit = 0;
+ bool enforceBF16 = false;
InferenceEngine::IStreamsExecutor::Config streamExecutorConfig;
#if defined(__arm__) || defined(__aarch64__)
#include "mkldnn_async_infer_request.h"
#include "mkldnn_infer_request.h"
#include "mkldnn_memory_state.h"
+#include "bf16transformer.h"
#include <ie_util_internal.hpp>
#include <graph_tools.hpp>
#include <cnn_network_int8_normalizer.hpp>
LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
"ScaleShift"));
transformer.transform(*_clonedNetwork);
+ if (with_cpu_x86_bfloat16()) {
+ BF16Transformer bf16Transformer;
+ CNNNetwork cnnetwork(_clonedNetwork);
+ if (cfg.enforceBF16 == true) {
+ bf16Transformer.convertToBFloat16(cnnetwork);
+ } else {
+ bf16Transformer.optimizeToFloat(cnnetwork);
+ }
+ } else {
+ BF16Transformer bf16Transformer;
+ CNNNetwork cnnetwork(_clonedNetwork);
+ bf16Transformer.convertToFloat(cnnetwork);
+ }
}
}
return 4;
case mkldnn::memory::data_type::s16:
return 2;
+ case mkldnn::memory::data_type::bf16:
+ return 2;
case mkldnn::memory::data_type::s8:
return 1;
case mkldnn::memory::data_type::u8:
return memory::s32;
case InferenceEngine::Precision::I16:
return memory::s16;
+ case InferenceEngine::Precision::BF16:
+ return memory::bf16;
case InferenceEngine::Precision::I8:
return memory::s8;
case InferenceEngine::Precision::U8:
return InferenceEngine::Precision::I32;
case memory::s16:
return InferenceEngine::Precision::I16;
+ case memory::bf16:
+ return InferenceEngine::Precision::BF16;
case memory::s8:
return InferenceEngine::Precision::I8;
case memory::u8:
return activationNode &&
(activationNode->getAlgorithm() == eltwise_relu ||
(conv->getCnnLayer()->precision == Precision::FP32 &&
+ conv->getCnnLayer()->insData[0].lock()->getPrecision() != Precision::BF16 &&
isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp, eltwise_swish})));
};
auto isSutableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == FullyConnected &&
+ node->getCnnLayer()->insData[0].lock()->getPrecision() != Precision::BF16 &&
node->getChildEdges().size() == 1;
};
bool isSutableConv = (node->getType() == Convolution) &&
node->getCnnLayer()->precision == Precision::FP32;
bool isSutableBinConv = node->getType() == BinaryConvolution;
- return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1;
+ return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1 &&
+ !(node->getCnnLayer()->insData[0].lock()->getPrecision() == Precision::BF16 &&
+ node->getCnnLayer()->outData[0]->getPrecision() == Precision::FP32);
};
auto isSutableChildNode = [](MKLDNNNodePtr node) {
auto isSutableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Convolution &&
node->getChildEdges().size() == 1 &&
- node->getCnnLayer()->precision == Precision::FP32;
+ node->getCnnLayer()->precision == Precision::FP32 &&
+ !(node->getCnnLayer()->insData[0].lock()->getPrecision() == Precision::BF16 &&
+ node->getCnnLayer()->outData[0]->getPrecision() == Precision::FP32);
};
auto isSutableChildNode = [&](MKLDNNNodePtr node) {
graph.DropNode(parent);
}
}
-}
\ No newline at end of file
+}
mkldnn::reorder reorderPrim(memory.GetPrimitive(), GetPrimitive());
mkldnn::stream(stream::kind::eager).submit({reorderPrim});
- if (ftz && memory.GetDataType() == mkldnn::memory::f32 && GetFormat() != mkldnn::memory::wino_fmt) {
+ if (ftz && memory.GetDataType() == mkldnn::memory::f32 && GetFormat() != mkldnn::memory::wino_fmt &&
+ GetDataType() != mkldnn::memory::bf16) {
// Internal blobs haven't strides yet.
auto *memData = static_cast<float *>(GetData());
memData += prim->get_primitive_desc().desc().data.layout_desc.blocking.offset_padding;
case mkldnn_bin:
precision = Precision::BIN;
break;
+ case mkldnn_bf16:
+ precision = Precision::BF16;
+ break;
default:
THROW_IE_EXCEPTION << "Cannot cast to TensorDesc. Unsupported precision!";
}
case Precision::BOOL:
data_type = mkldnn::memory::data_type::u8;
break;
+ case Precision::BF16:
+ data_type = mkldnn::memory::data_type::bf16;
+ break;
default:
THROW_IE_EXCEPTION << "Cannot create MKLDNNMemoryDesc from TensorDesc. Unsupported precision!";
}
internalBlob = InferenceEngine::make_shared_blob<int8_t>(desc);
} else if (blb->getTensorDesc().getPrecision() == Precision::I32) {
internalBlob = InferenceEngine::make_shared_blob<int32_t>(desc);
+ } else if (blb->getTensorDesc().getPrecision() == Precision::BF16) {
+ internalBlob = InferenceEngine::make_shared_blob<int16_t>(desc);
} else {
internalBlob = InferenceEngine::make_shared_blob<float>(desc);
}
#include <ie_system_conf.h>
#include <generic_ie.hpp>
-#include "cnn_network_ngraph_impl.hpp"
#include "convert_function_to_cnn_network.hpp"
#include <transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>
#include <transformations/convert_opset2_to_opset1/convert_opset2_to_opset1.hpp>
std::shared_ptr<ICNNNetwork> clonedNetwork(nullptr);
- if (auto networkNGraph = dynamic_cast<const CNNNetworkNGraphImpl*>(&network)) {
- auto nGraphNetwork = networkNGraph->cloneNGraphImpl();
- if (!nGraphNetwork->getFunction()) {
- clonedNetwork = nGraphNetwork->getCNNNetwork();
- } else {
- const auto transformations_callback = [](const std::shared_ptr<const ::ngraph::Node> &node) -> bool {
- return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
- std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
- std::dynamic_pointer_cast<const ::ngraph::opset2::SpaceToBatch>(node);
- };
- // Disable shape inference (WA for generic operations)
- ::ngraph::op::GenericIE::DisableReshape noReshape(nGraphNetwork->getFunction());
-
- // Note: instead of running all Conversion Transformations you can make up your own transformation pipeline
- ngraph::pass::ConvertOpSet2ToOpSet1(transformations_callback).run_on_function(nGraphNetwork->getFunction());
- ngraph::pass::ConvertOpSet1ToLegacy(transformations_callback).run_on_function(nGraphNetwork->getFunction());
- clonedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(nGraphNetwork->getFunction(), *nGraphNetwork.get());
- }
+ if (network.getFunction()) {
+ const auto transformations_callback = [](const std::shared_ptr<const ::ngraph::Node> &node) -> bool {
+ return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
+ std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
+ std::dynamic_pointer_cast<const ::ngraph::opset2::SpaceToBatch>(node);
+ };
+ CNNNetwork net(network.getFunction());
+ auto nGraphFunc = net.getFunction();
+ // Disable shape inference (WA for generic operations)
+ ::ngraph::op::GenericIE::DisableReshape noReshape(nGraphFunc);
+
+ // Note: instead of running all Conversion Transformations you can make up your own transformation pipeline
+ ngraph::pass::ConvertOpSet2ToOpSet1(transformations_callback).run_on_function(nGraphFunc);
+ ngraph::pass::ConvertOpSet1ToLegacy(transformations_callback).run_on_function(nGraphFunc);
+ clonedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, network);
} else {
clonedNetwork = cloneNet(network);
}
}
// All extension layers support only FP32 precision!
+    // Fix up BF16 precisions where they appear - extension layers natively support only FP32.
+    // If we see BF16, it denotes another floating-point format which will be converted by a reorder
+    // added by the current mkl-dnn cpu plugin when it figures out a difference in data types on the input and output of edges.
InferenceEngine::Precision precision = data_desc.getPrecision();
+ if (precision == Precision::BF16) {
+ precision = Precision::FP32;
+ }
if (conf.layout == ConfLayout::ANY) {
dataConfig.desc = TensorDesc(precision, data_dims, InferenceEngine::Layout::ANY);
} else {
THROW_IE_EXCEPTION << "Interp supports only 4d blobs!";
auto src_precision = inData->getTensorDesc().getPrecision();
- if (src_precision != Precision::FP32 && src_precision != Precision::U8)
- THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only U8 or FP32 are supported!";
+ if (src_precision != Precision::FP32 && src_precision != Precision::U8 && src_precision != Precision::BF16)
+ THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only U8 or FP32 or BF16 are supported!";
- if (layer->outData[0]->getTensorDesc().getPrecision() != Precision::FP32)
- THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 is supported!";
+ auto dst_precision = layer->outData[0]->getTensorDesc().getPrecision();
+ if (dst_precision != Precision::FP32 && dst_precision != Precision::BF16)
+ THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 or BF16 are supported!";
// We don't read other parameters since they are needed only for dst reshape in caffe
pad_beg = layer->GetParamAsInt("pad_beg");
if (mayiuse(avx512_common)) {
blk_layout = ConfLayout::BLK16;
interp_kernel.reset(new jit_uni_interp_kernel_f32<avx512_common>());
+ addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
} else if (mayiuse(avx2)) {
blk_layout = ConfLayout::BLK8;
interp_kernel.reset(new jit_uni_interp_kernel_f32<avx2>());
+ addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
} else {
blk_layout = ConfLayout::BLK8;
interp_kernel.reset(new jit_uni_interp_kernel_f32<sse42>());
+ addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
}
- addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
}
} catch (InferenceEngine::details::InferenceEngineException &ex) {
errorMsg = ex.what();
case Precision::FP32:
{
const float* src_data = inputs[0]->cbuffer().as<const float *>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
- size_t IC = inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1] *
- inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[4];
+ size_t IC = (inputs[0]->getTensorDesc().getLayout() == Layout::BLOCKED)
+ ? inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1] *
+ inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[4]
+ : IC = inputs[0]->getTensorDesc().getDims()[1];
interpolate(IN, IC, src_data,
-pad_beg, -pad_beg, IH_pad, IW_pad, IH, IW, dst_data, 0, 0, OH, OW, OH, OW);
}
}
int block_size = 1;
- if (mayiuse(avx512_common)) {
- block_size = 16;
- } else {
- block_size = 8;
+ if (interp_kernel) {
+ if (mayiuse(avx512_common)) {
+ block_size = 16;
+ } else {
+ block_size = 8;
+ }
}
// Align channel number to block size to deal with channels padding in IE with multiple blobs
float *pdst = pdst_h + w * block_size;
- arg.src00 = psrc00;
- arg.src01 = psrc01;
- arg.src10 = psrc10;
- arg.src11 = psrc11;
- arg.dst = pdst;
- arg.w_lambda0 = static_cast<float*>(&w_lambda0);
- arg.w_lambda1 = static_cast<float*>(&w_lambda1);
- (*interp_kernel)(&arg);
+ if (interp_kernel) {
+ arg.src00 = psrc00;
+ arg.src01 = psrc01;
+ arg.src10 = psrc10;
+ arg.src11 = psrc11;
+ arg.dst = pdst;
+ arg.w_lambda0 = static_cast<float*>(&w_lambda0);
+ arg.w_lambda1 = static_cast<float*>(&w_lambda1);
+ (*interp_kernel)(&arg);
+ } else {
+ for (int c = 0; c < block_size; ++c) {
+ pdst[c] = h_lambda1 * (w_lambda1 * psrc00[c] + w_lambda0 * psrc01[c]) +
+ h_lambda0 * (w_lambda1 * psrc10[c] + w_lambda0 * psrc11[c]);
+ }
+ }
}
});
}
}
}
+// Returns the precision of the Eltwise input that does NOT come from the fused
+// (convolution) path: the fused producer is fusedWith[findex - 1], or this
+// convolution itself when findex == 0; whichever Eltwise input it does not
+// feed is the "other" tensor whose precision is reported.
+// NOTE(review): only insData[0]/insData[1] are inspected — assumes a
+// two-input Eltwise; confirm for n-ary fusions.
+InferenceEngine::Precision MKLDNNConvolutionNode::fusedEltwisePrecision(MKLDNNEltwiseNode *eltwiseNode, int findex) {
+    InferenceEngine::Precision eltwisePrecision;
+    // Producers of both Eltwise inputs (parent1 is currently unused — kept as-is).
+    auto parent0 = eltwiseNode->getCnnLayer()->insData[0].lock()->getCreatorLayer().lock();
+    auto parent1 = eltwiseNode->getCnnLayer()->insData[1].lock()->getCreatorLayer().lock();
+
+    auto fusedParent = findex != 0 ? fusedWith[findex - 1].get()->getCnnLayer() : this->getCnnLayer();
+    // If the fused path feeds input 0, the second tensor is input 1, and vice versa.
+    eltwisePrecision = fusedParent == parent0 ? eltwiseNode->getCnnLayer()->insData[1].lock()->getPrecision() :
+        eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision();
+    return eltwisePrecision;
+}
+
void MKLDNNConvolutionNode::getSupportedDescriptors() {
if (!descs.empty())
return;
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
- if (outputDataType != memory::f32 && isFusedWith(Eltwise)) {
+ if (outputDataType != memory::f32 && outputDataType != memory::bf16 && isFusedWith(Eltwise)) {
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode) {
- auto parent0 = eltwiseNode->getCnnLayer()->insData[0].lock()->getCreatorLayer().lock();
- auto parent1 = eltwiseNode->getCnnLayer()->insData[1].lock()->getCreatorLayer().lock();
-
- auto fusedParent = i != 0 ? fusedWith[i-1].get()->getCnnLayer() : this->getCnnLayer();
- eltwisePrecision = fusedParent == parent0 ? eltwiseNode->getCnnLayer()->insData[1].lock()->getPrecision() :
- eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision();
-
+ eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
eltwisePrecision = Precision::FP32;
outputDataType = memory::f32;
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::ndhwc : memory::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else {
- // If the weights aren't quantized, the only precision we support is FP32
- inputDataType = memory::f32;
- outputDataType = memory::f32;
+ inputDataType = convLayer->input()->getPrecision() == Precision::BF16 ? memory::bf16 : memory::f32;
+ outputDataType = convLayer->outData[0]->getPrecision() == Precision::BF16 ? memory::bf16 : memory::f32;
eltwisePrecision = Precision::FP32;
+ for (int i = 0; i < fusedWith.size(); i++) {
+ auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+ if (eltwiseNode) {
+ eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
+                    // TODO(amalyshe): there might be a situation when convolution can be executed in BF16,
+                    // the output is required in FP32, but the eltwise in-place tensor would be in BF16.
+                    // Currently we forcibly change the output to BF16, which will add a reorder after the node.
+                    // Another option would be to mark the output as FP32 and the Eltwise asPrecision (which stands
+                    // for the precision of the in-place input tensor) as FP32. This would add a reorder for that in-place tensor
+                    // before the fused convolution. This behaviour might be more correct regarding the expected markup
+                    // of the graph, but the performance of the first and second approaches might differ. Need to verify.
+ outputDataType = eltwisePrecision == Precision::BF16 ? memory::bf16 : memory::f32;
+ }
+ }
+            // correction for cases of FP32 input - we do not have an FP32 convolution that supports BF16 output
+ if (inputDataType == memory::f32
+ && (outputDataType == memory::bf16 || eltwisePrecision == Precision::BF16)) {
+ outputDataType = memory::f32;
+ eltwisePrecision = Precision::FP32;
+ }
Layout layout = convLayer->input()->getLayout();
mkldnn::memory::data_type wdt = precisionToDataType(inDesc.getPrecision());
mkldnn::memory::data_type bdt = precisionToDataType(inDesc.getPrecision());
+ if (inDesc.getPrecision() == Precision::BF16) {
+ bdt = mkldnn::memory::data_type::f32;
+ }
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::s8;
// Works only for FP32 convolutions for now.
bool isStridedBlobsSupported = true;
for (auto &insData : getCnnLayer()->insData) {
- if (insData.lock()->getPrecision() != InferenceEngine::Precision::FP32) {
+ if (insData.lock()->getPrecision() != InferenceEngine::Precision::FP32
+ && insData.lock()->getPrecision() != InferenceEngine::Precision::BF16) {
isStridedBlobsSupported = false;
break;
}
namespace MKLDNNPlugin {
+class MKLDNNEltwiseNode;
+
class MKLDNNConvolutionNode : public MKLDNNNode {
public:
MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, int socket);
protected:
void addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) const;
+ InferenceEngine::Precision fusedEltwisePrecision(MKLDNNEltwiseNode *eltwiseNode, int findex);
private:
mkldnn::memory::data_type precisionToDataType(InferenceEngine::Precision prec);
inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(in_prec);
}
+ if (inputDT == memory::bf16 || outputDT == memory::bf16) {
+ inputDT = memory::f32;
+ outputDT = memory::f32;
+ }
+
auto impl_desc = initDesc(inputDT, outputDT, format);
if (impl_desc.getImplementationType() != impl_desc_type::undef) {
}
auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
- if (inputDataType != memory::u8 || weightsDataType != memory::s8) {
+    // TODO(amalyshe): in which cases do we have non-i8 weights and have to override the precisions?
+ if ((inputDataType != memory::u8 || weightsDataType != memory::s8) && inputDataType != memory::bf16) {
inputDataType = memory::f32;
outputDataType = memory::f32;
}
TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0];
mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
mkldnn::memory::data_type bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
+ if (inDesc.getPrecision() == Precision::BF16) {
+ bdt = mkldnn::memory::data_type::f32;
+ }
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::s8;
memory::format outFormat = mkldnn::memory::format_undef;
if (getType() == Input || getType() == MemoryInput) {
precision = getCnnLayer()->outData[0]->getPrecision();
- if (precision == InferenceEngine::Precision::U16 || isMeanImage)
+ if (precision == InferenceEngine::Precision::U16 || isMeanImage) {
precision = InferenceEngine::Precision::FP32;
+ }
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
if (!descs.empty())
return;
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
+ if (precision != InferenceEngine::Precision::FP32 && precision != InferenceEngine::Precision::BF16)
precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
auto * lrnLayer = dynamic_cast<NormLayer*>(getCnnLayer().get());
inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
outputPrecision = getCnnLayer()->outData[0]->getPrecision();
// Dirty WA to support stat based quantization approach
- if (this->getCnnLayer()->precision != Precision::I8) {
+ if (this->getCnnLayer()->precision != Precision::I8
+ && inputPrecision != Precision::BF16) {
if (type == PoolingLayer::MAX) {
// MKLDNN supports only equal precisions for input and output
outputPrecision = inputPrecision;
outputPrecision = Precision::FP32;
}
}
+ if (inputPrecision == Precision::BF16) {
+ outputPrecision = inputPrecision;
+ }
if (!fusedWith.empty()) {
auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
createDescriptor({ in_candidate }, { out_candidate });
+ } else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && (inputDataType == memory::bf16 || outputDataType == memory::bf16)) {
+ MKLDNNMemoryDesc in_candidate{ parentDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c};
+ MKLDNNMemoryDesc out_candidate{ childDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c};
+ createDescriptor({ in_candidate }, { out_candidate });
} else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && parentDims[1] == 1) {
inputDataType = memory::f32;
outputDataType = memory::f32;
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
createDescriptor({ in_candidate }, { out_candidate });
} else {
- inputDataType = memory::f32;
- outputDataType = memory::f32;
+ if (inputDataType != memory::bf16) {
+ inputDataType = memory::f32;
+ outputDataType = memory::f32;
+ }
// It doesn't support any format
for (auto format : getAvailableFormatsForDims(parentDims)) {
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, format};
#include <memory>
#include <string>
#include <vector>
+#include "bf16transformer.h"
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::utils;
THROW_IE_EXCEPTION << "Normalize supports from 2D to 4D blobs!";
}
- weights = std::dynamic_pointer_cast<TBlob<float>>(layer->blobs.at("weights"));
- if (!weights)
- THROW_IE_EXCEPTION << layer->name << " weights is empty!";
+ MemoryBlob::Ptr tweights = as<MemoryBlob>(layer->blobs.at("weights"));
+ if (!tweights) {
+ THROW_IE_EXCEPTION << layer->name << "Weights are not initialized or cannot be casted to MemoryBlob for layer Normalize with name '"
+ << layer->name << "'";
+ }
+
+ if (tweights->getTensorDesc().getPrecision() == Precision::FP32) {
+ weights = tweights;
+ } else if (tweights->getTensorDesc().getPrecision() == Precision::BF16) {
+ MKLDNNPlugin::BF16Transformer transformer;
+ weights = transformer.convertBF16ToFloat(tweights);
+ } else {
+ // Unknown non supported data type, return an error
+ THROW_IE_EXCEPTION << layer->name << "Weights for layer Normalize wiht name '" << layer->name <<
+ "' has unsupported data type " << tweights->getTensorDesc().getPrecision();
+ }
across_spatial = layer->GetParamAsBool("across_spatial", false);
channel_shared = layer->GetParamAsBool("channel_shared", false);
eps = layer->GetParamAsFloat("eps");
std::shared_ptr<jit_uni_normalize_across_spatial_kernel> normalize_across_spatial_kernel;
std::shared_ptr<jit_uni_sqr_sum_kernel> sqr_sum_kernel;
- TBlob<float>::Ptr weights;
+ MemoryBlob::Ptr weights;
bool across_spatial = true;
bool channel_shared = true;
float eps = 1e-10f;
if (layer->outData.size() != 1 && layer->outData.size() != 2)
THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges!";
- if (layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 ||
+ // DataConfigurator::addConfig will automatically change BF16 datatype to FP32
+ // it can be changed back by explicit modification like confs.back().outConfs[i].desc.setPrecision(Precision::BF16);
+    // if the current layer supports BF16 natively. Usually it does not, and nothing special is required.
+ if ((layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 &&
+ layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::BF16) ||
layer->insData[TOPK_K].lock()->getTensorDesc().getPrecision() != Precision::I32)
- THROW_IE_EXCEPTION << layer->name << " Incorrect input data/index values precision.";
+ THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input data/index values precision.";
if (layer->insData[TOPK_K].lock()->getTensorDesc().getDims().size() > 1)
- THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension";
+ THROW_IE_EXCEPTION << layer->name << " TopKImpl - Index vector should be 1 dimension";
SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();
SizeVector src_data_dims = layer->insData[TOPK_DATA].lock()->getTensorDesc().getDims();
if (src_data_dims.size() != dst_dims.size())
- THROW_IE_EXCEPTION << layer->name << " Incorrect input/output tensor dimension sizes";
+ THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input/output tensor dimension sizes";
if (layer->outData.size() == 2) {
- if (layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::FP32)
- THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 is supported!";
+ if (layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::FP32 &&
+ layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::BF16)
+ THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect output data tensor precision. Floating point datatypes are supported!";
SizeVector dst_idx_dims = layer->outData[TOPK_INDEX]->getTensorDesc().getDims();
if (dst_dims.size() != dst_idx_dims.size())
break;
}
case Precision::I16:
- case Precision::U16: {
- auto *pln_blob_ptr = pln_blob->buffer().as<int16_t*>();
+ case Precision::U16:
+ case Precision::BF16: {
+ auto *pln_blob_ptr = pln_blob->buffer().as<int16_t *>();
auto *blob_ptr = blob->buffer().as<int16_t *>();
- for (size_t i = 0; i < data_size; i++)
- pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+ for (size_t i = 0; i < data_size; i++) pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
break;
}
case Precision::I8:
<< dims.size() << "D "
<< "shape: ";
for (size_t d : dims) stream << d << " ";
- stream << "(" << _blob->size() << ")" <<std::endl;
+ stream << "(" << _blob->size() << ")" <<
+ " by address 0x" << std::hex << _blob->buffer().as<long long>() << std::dec <<std::endl;
// Dump data
MKLDNNMemoryDesc mdesc(_blob->getTensorDesc());
stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
break;
}
+ case Precision::BF16:
+ {
+ auto *blob_ptr = _blob->buffer().as<int16_t *>();
+ for (size_t i = 0; i < data_size; i++) {
+ int i16n = blob_ptr[blob_wrp.off_l(i)];
+ i16n = i16n << 16;
+ float fn = *(reinterpret_cast<float *>(&i16n));
+ stream << fn << std::endl;
+ }
+ break;
+ }
case Precision::I32: {
auto *blob_ptr = _blob->buffer().as<int32_t*>();
for (size_t i = 0; i < data_size; i++)
USE_FACTORY(I64);
USE_FACTORY(U64);
USE_FACTORY(BIN);
+ USE_FACTORY(BF16);
USE_FACTORY(BOOL);
default:
THROW_IE_EXCEPTION << "cannot locate blob for precision: " << precision;
* @snippet example_async_infer_request.cpp async_infer_request:define_pipeline
*/
class AsyncInferRequestThreadSafeDefault : public AsyncInferRequestThreadSafeInternal {
+ using AtomicCallback = std::atomic<IInferRequest::CompletionCallback>;
+ using Futures = std::vector<std::shared_future<void>>;
+ using Promise = std::shared_ptr<std::promise<void>>;
+ enum Stage_e : std::uint8_t { executor, task };
+    // RAII guard: atomically detaches the completion callback on construction
+    // (callback.exchange(nullptr)) and restores it on destruction, so the user
+    // callback does not fire while a synchronous Infer()-style call is running.
+    struct DisableCallbackGuard{
+        explicit DisableCallbackGuard(AtomicCallback& callback)
+            : _callbackRef(callback), _callback(callback.exchange(nullptr)) {}
+        ~DisableCallbackGuard() {
+            // Restore the original callback for subsequent async requests.
+            _callbackRef = _callback;
+        }
+        AtomicCallback& _callbackRef;      // reference to the request's atomic callback slot
+        IInferRequest::CompletionCallback _callback;  // saved callback to restore
+    };
+ InferRequestInternal::Ptr _syncRequest;
+
public:
/**
* @brief A shared pointer to AsyncInferRequestThreadSafeDefault
using Ptr = std::shared_ptr<AsyncInferRequestThreadSafeDefault>;
/**
- * @brief Wraps a InferRequestInternal::Ptr implementation and constructs a
+ * @brief Wraps a InferRequestInternal::Ptr implementation and constructs a
* AsyncInferRequestThreadSafeDefault::_pipeline where `taskExecutor` is used to run InferRequestInternal::Infer
* asynchronously.
*
AsyncInferRequestThreadSafeDefault(const InferRequestInternal::Ptr& request,
const ITaskExecutor::Ptr& taskExecutor,
const ITaskExecutor::Ptr& callbackExecutor)
- : _requestExecutor {taskExecutor},
+ : _syncRequest {request},
+ _requestExecutor {taskExecutor},
_callbackExecutor {callbackExecutor},
- _syncRequest {request} {
- _pipeline = {
- { _requestExecutor, [this] { _syncRequest->Infer(); } }
- };
+ _pipeline {{taskExecutor, [this] {_syncRequest->Infer();}}},
+ _syncPipeline{{std::make_shared<ImmediateExecutor>(), [this] {_syncRequest->Infer();}}} {
}
/**
* @brief Creates and run the first stage task. If destructor was not called add a new std::future to the
* AsyncInferRequestThreadSafeDefault::_futures list that would be used to wait
* AsyncInferRequestThreadSafeDefault::_pipeline finish
+ * @param[in] itBeginStage Iterator to begin of pipeline
+ * @param[in] itEndStage End pipeline iterator
+ * @param[in] callbackExecutor Final or error stage executor
*/
- void RunFirstStage() {
- _itStage = _pipeline.begin();
+ void RunFirstStage(const Pipeline::iterator itBeginStage, const Pipeline::iterator itEndStage,
+ const ITaskExecutor::Ptr callbackExecutor = {}) {
_promise = {};
bool stop = [&] {
std::lock_guard<std::mutex> lock(_mutex);
if (!stop) {
try {
- auto& firstStageExecutor = std::get<Stage_e::executor>(*_itStage);
+ auto& firstStageExecutor = std::get<Stage_e::executor>(*itBeginStage);
IE_ASSERT(nullptr != firstStageExecutor);
- firstStageExecutor->run(MakeNextStageTask());
+ firstStageExecutor->run(MakeNextStageTask(itBeginStage, itEndStage, std::move(callbackExecutor)));
} catch (...) {
_promise.set_exception(std::current_exception());
throw;
* @brief Implements Infer() using StartAsync() and Wait()
*/
    void InferUsingAsync() {
-        struct CallbackStorage {
-            explicit CallbackStorage(AtomicCallback& callback)
-                : _callbackRef(callback), _callback(callback.exchange(nullptr)) {}
-            ~CallbackStorage() {
-                _callbackRef = _callback;
-            }
-            AtomicCallback& _callbackRef;
-            IInferRequest::CompletionCallback _callback;
-        } storage {_callback};
+        // Local CallbackStorage replaced by the shared DisableCallbackGuard RAII
+        // helper; semantics are identical (suppress callback for this call).
+        DisableCallbackGuard disableCallbackGuard{_callback};
        StartAsync_ThreadUnsafe();
        Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
    }
- ITaskExecutor::Ptr _requestExecutor; //!< Used to run inference CPU tasks
- ITaskExecutor::Ptr _callbackExecutor; //!< Used to run post inference callback
+    /**
+     * @brief Implements Infer() using synchronous pipeline and Wait()
+     *
+     * Validates blobs, then runs _syncPipeline (whose first stage uses an
+     * immediate executor — see the constructor) with the user completion
+     * callback disabled for the duration of the call, and blocks until the
+     * result is ready.
+     */
+    void InferUsingSync() {
+        // Suppress the user completion callback for this synchronous call.
+        DisableCallbackGuard disableCallbackGuard{_callback};
+        _syncRequest->checkBlobs();
+        RunFirstStage(_syncPipeline.begin(), _syncPipeline.end(), _syncCallbackExecutor);
+        Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
+    }
+
+ ITaskExecutor::Ptr _requestExecutor; //!< Used to run inference CPU tasks.
+    ITaskExecutor::Ptr _callbackExecutor;  //!< Used to run the post-inference callback in the asynchronous pipeline
+    ITaskExecutor::Ptr _syncCallbackExecutor;  //!< Used to run the post-inference callback in the synchronous pipeline
Pipeline _pipeline; //!< Pipeline variable that should be filled by inherited class.
+ Pipeline _syncPipeline; //!< Synchronous pipeline variable that should be filled by inherited class.
    void StartAsync_ThreadUnsafe() override {
        _syncRequest->checkBlobs();
-        RunFirstStage();
+        // Kick off the asynchronous pipeline explicitly; the final/error stage
+        // runs on _callbackExecutor.
+        RunFirstStage(_pipeline.begin(), _pipeline.end(), _callbackExecutor);
    }
    void Infer_ThreadUnsafe() override {
-        _syncRequest->checkBlobs();
-        _syncRequest->InferImpl();
+        // checkBlobs() now happens inside InferUsingSync(), which runs the
+        // synchronous pipeline and waits for the result, so sync and async
+        // paths share the same stage machinery.
+        InferUsingSync();
    }
void GetPerformanceCounts_ThreadUnsafe(std::map<std::string, InferenceEngineProfileInfo>& perfMap) const override {
}
private:
- using AtomicCallback = std::atomic<IInferRequest::CompletionCallback>;
- using Futures = std::vector<std::shared_future<void>>;
- using Promise = std::shared_ptr<std::promise<void>>;
- enum Stage_e : std::uint8_t { executor, task };
-
/**
* @brief Create a task with next pipeline stage.
- * Each call to MakeNextStageTask() generates `InferenceEngine::Task` objects for each stage.
- * When stage task is called it increments
- * `_stage` counter, call `_pipeline` task for this stage and generates next stage task using
- * MakeNextStageTask() and pass it to executor. On last stage or if the exception is raised from `_pipeline` task
+ * Each call to MakeNextStageTask() generates @ref Task objects for each stage.
+ * On last stage or if the exception is raised from `_pipeline` task
* the last stage task is called or passed to callback executor if it is presented. The last stage task call the
* callback, if it is presented, capture the `_promise` member and use it to forward completion or exception to the
* one of `_futures` member
+ * @param[in] itStage Iterator to next stage of pipeline
+ * @param[in] itEndStage End pipeline iterator
+ * @param[in] callbackExecutor Executor that will run final stage with callback call
* @return A next stage task
*/
- Task MakeNextStageTask() {
- return [this]() mutable {
+ Task MakeNextStageTask(const Pipeline::iterator itStage, const Pipeline::iterator itEndStage,
+ const ITaskExecutor::Ptr callbackExecutor) {
+ return std::bind([this, itStage, itEndStage](ITaskExecutor::Ptr& callbackExecutor) mutable {
StatusCode requestStatus = StatusCode::OK;
std::exception_ptr localCurrentException = nullptr;
- auto& thisStage = *_itStage;
- auto copyItStage = ++_itStage;
+ auto& thisStage = *itStage;
+ auto itNextStage = itStage + 1;
try {
auto& stageTask = std::get<Stage_e::task>(thisStage);
IE_ASSERT(nullptr != stageTask);
stageTask();
- if (_pipeline.end() != _itStage) {
- auto nextStage = *_itStage;
+ if (itEndStage != itNextStage) {
+ auto& nextStage = *itNextStage;
auto& nextStageExecutor = std::get<Stage_e::executor>(nextStage);
IE_ASSERT(nullptr != nextStageExecutor);
- nextStageExecutor->run(MakeNextStageTask());
+ nextStageExecutor->run(MakeNextStageTask(itNextStage, itEndStage, std::move(callbackExecutor)));
}
} catch (InferenceEngine::details::InferenceEngineException& ie_ex) {
requestStatus = ie_ex.hasStatus() ? ie_ex.getStatus() : StatusCode::GENERAL_ERROR;
localCurrentException = std::current_exception();
}
- if ((_pipeline.end() == copyItStage) || (nullptr != localCurrentException)) {
+ if ((itEndStage == itNextStage) || (nullptr != localCurrentException)) {
auto lastStageTask = [this, requestStatus, localCurrentException]() mutable {
auto promise = std::move(_promise);
auto callback = _callback.load();
}
};
- if (nullptr == _callbackExecutor) {
+ if (nullptr == callbackExecutor) {
lastStageTask();
} else {
- _callbackExecutor->run(std::move(lastStageTask));
+ callbackExecutor->run(std::move(lastStageTask));
}
}
- };
+ }, std::move(callbackExecutor));
}
- InferRequestInternal::Ptr _syncRequest;
void* _userData = nullptr;
AtomicCallback _callback = {nullptr};
IInferRequest::Ptr _publicInterface;
- Pipeline::iterator _itStage;
std::promise<void> _promise;
mutable std::mutex _mutex;
Futures _futures;
*/
INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core();
+/**
+ * @brief Checks whether CPU supports BFloat16 capability
+ * @ingroup ie_dev_api_system_conf
+ * @return `True` is tAVX512_BF16 instructions are available, `false` otherwise
+ */
+INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_bfloat16();
+
} // namespace InferenceEngine
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <unordered_map>
#include <vector>
#include <utility>
+#include <mutex>
#include "threading/ie_itask_executor.hpp"
#include "threading/ie_istreams_executor.hpp"
private:
std::unordered_map<std::string, ITaskExecutor::Ptr> executors;
std::vector<std::pair<IStreamsExecutor::Config, IStreamsExecutor::Ptr> > cpuStreamsExecutors;
+ std::mutex streamExecutorMutex;
+ std::mutex taskExecutorMutex;
};
/**
* @brief Returns a global instance of ExecutorManager
* @return The instance.
*/
- static ExecutorManager* getInstance() {
- if (!_instance) {
- _instance = new ExecutorManager();
- }
-
- return _instance;
- }
+ static ExecutorManager* getInstance();
/**
* @brief A deleted copy constructor
*/
size_t getExecutorsNumber();
- /**
- * @cond
- */
size_t getIdleCPUStreamsExecutorsNumber();
void clear(const std::string& id = {});
ExecutorManager() {}
ExecutorManagerImpl _impl;
- static ExecutorManager* _instance;
+
+ static std::mutex _mutex;
+ static ExecutorManager *_instance;
};
} // namespace InferenceEngine
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
# install
install(TARGETS ${TARGET_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
}
//------------------------------------------------------------------------------
-namespace calcRowArea {
-// vertical pass
-template<typename T, typename A, typename I, typename W>
-static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
- W vbuf[]) {
- int y_1st = ymap.index0;
- int ylast = ymap.index1 - 1;
-
- // yratio > 1, so at least 2 rows
- GAPI_DbgAssert(y_1st < ylast);
-
- // 1st and last rows
- {
- int w = 0;
-
- #if CV_SIMD128
- if (std::is_same<T, uint8_t>::value) {
- for (; w <= inWidth - 8; w += 8) {
- v_uint16x8 vsrc0 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[0][w]));
- v_uint16x8 vsrc1 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[ylast - y_1st][w]));
- v_uint16x8 vres = v_mulhi(vsrc0 << 8, static_cast<Q0_16>(ymap.alpha0)) +
- v_mulhi(vsrc1 << 8, static_cast<Q0_16>(ymap.alpha1));
- v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
- }
- }
- #endif
-
- for (; w < inWidth; w++) {
- vbuf[w] = mulas(ymap.alpha0, src[0][w])
- + mulas(ymap.alpha1, src[ylast - y_1st][w]);
- }
- }
-
- // inner rows (if any)
- for (int i = 1; i < ylast - y_1st; i++) {
- int w = 0;
-
- #if CV_SIMD128
- if (std::is_same<T, uint8_t>::value) {
- for (; w <= inWidth - 8; w += 8) {
- v_uint16x8 vsrc = v_load_expand(reinterpret_cast<const uint8_t*>(& src[i][w]));
- v_uint16x8 vres = v_load(reinterpret_cast<Q8_8*>(& vbuf[w]));
- vres = vres + v_mulhi(vsrc << 8, static_cast<Q0_16>(yalpha));
- v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
- }
- }
- #endif
-
- for (; w < inWidth; w++) {
- vbuf[w] += mulas(yalpha, src[i][w]);
- }
- }
-}
-
-// horizontal pass
-template<typename T, typename A, typename I, typename W>
-static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], const A xalpha[],
- const W vbuf[]) {
-#define HSUM(xmaxdf) \
- for (int x = 0; x < outWidth; x++) { \
- int index = xindex[x]; \
- const A *alpha = &xalpha[x * xmaxdf]; \
-\
- W sum = 0; \
- for (int i = 0; i < xmaxdf; i++) { \
- sum += mulaw(alpha[i], vbuf[index + i]); \
- } \
-\
- dst[x] = convert_cast<T>(sum); \
- }
-
- if (2 == xmaxdf) {
- HSUM(2);
- } else if (3 == xmaxdf) {
- HSUM(3);
- } else if (4 == xmaxdf) {
- HSUM(4);
- } else if (5 == xmaxdf) {
- HSUM(5);
- } else if (6 == xmaxdf) {
- HSUM(6);
- } else if (7 == xmaxdf) {
- HSUM(7);
- } else if (8 == xmaxdf) {
- HSUM(8);
- } else {
- HSUM(xmaxdf);
- }
-#undef HSUM
-}
-} // namespace calcRowArea
-
-template<typename T, typename A, typename I, typename W>
-static void calcRowArea_impl_sse4(T dst[], const T *src[], const Size& inSz, const Size& outSz,
- A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
- W vbuf[]) {
- bool xRatioEq1 = inSz.width == outSz.width;
- bool yRatioEq1 = inSz.height == outSz.height;
-
- if (!yRatioEq1 && !xRatioEq1) {
- calcRowArea::downy(src, inSz.width, ymap, yalpha, vbuf);
- calcRowArea::downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
-
- } else if (!yRatioEq1) {
- GAPI_DbgAssert(xRatioEq1);
- calcRowArea::downy(src, inSz.width, ymap, yalpha, vbuf);
- for (int x = 0; x < outSz.width; x++) {
- dst[x] = convert_cast<T>(vbuf[x]);
- }
-
- } else if (!xRatioEq1) {
- GAPI_DbgAssert(yRatioEq1);
- for (int w = 0; w < inSz.width; w++) {
- vbuf[w] = convert_cast<W>(src[0][w]);
- }
- calcRowArea::downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
-
- } else {
- GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
- memcpy(dst, src[0], outSz.width * sizeof(T));
- }
-}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
Q8_8 vbuf[]) {
- calcRowArea_impl_sse4(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
+ calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, const Size& outSz,
float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
float vbuf[]) {
- calcRowArea_impl_sse4(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
+ calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
//------------------------------------------------------------------------------
auto dst = out.OutLine<T>(l);
- #ifdef HAVE_SSE
+ #ifdef HAVE_AVX512
+ if (with_cpu_x86_avx512f()) {
+ if (std::is_same<T, uchar>::value) {
+ avx512::calcRowArea_8U(reinterpret_cast<uchar*>(dst),
+ reinterpret_cast<const uchar**>(src),
+ inSz, outSz,
+ static_cast<Q0_16>(ymapper.alpha),
+ reinterpret_cast<const MapperUnit8U&>(ymap),
+ xmaxdf[0],
+ reinterpret_cast<const short*>(xindex),
+ reinterpret_cast<const Q0_16*>(xalpha),
+ reinterpret_cast<Q8_8*>(vbuf));
+ continue; // next l = 0, ..., lpi-1
+ }
+
+ if (std::is_same<T, float>::value) {
+ avx512::calcRowArea_32F(reinterpret_cast<float*>(dst),
+ reinterpret_cast<const float**>(src),
+ inSz, outSz,
+ static_cast<float>(ymapper.alpha),
+ reinterpret_cast<const MapperUnit32F&>(ymap),
+ xmaxdf[0],
+ reinterpret_cast<const int*>(xindex),
+ reinterpret_cast<const float*>(xalpha),
+ reinterpret_cast<float*>(vbuf));
+ continue;
+ }
+ }
+ #endif // HAVE_AVX512
+
+ #ifdef HAVE_AVX2
+ if (with_cpu_x86_avx2()) {
+ if (std::is_same<T, uchar>::value) {
+ avx::calcRowArea_8U(reinterpret_cast<uchar*>(dst),
+ reinterpret_cast<const uchar**>(src),
+ inSz, outSz,
+ static_cast<Q0_16>(ymapper.alpha),
+ reinterpret_cast<const MapperUnit8U&>(ymap),
+ xmaxdf[0],
+ reinterpret_cast<const short*>(xindex),
+ reinterpret_cast<const Q0_16*>(xalpha),
+ reinterpret_cast<Q8_8*>(vbuf));
+ continue; // next l = 0, ..., lpi-1
+ }
+
+ if (std::is_same<T, float>::value) {
+ avx::calcRowArea_32F(reinterpret_cast<float*>(dst),
+ reinterpret_cast<const float**>(src),
+ inSz, outSz,
+ static_cast<float>(ymapper.alpha),
+ reinterpret_cast<const MapperUnit32F&>(ymap),
+ xmaxdf[0],
+ reinterpret_cast<const int*>(xindex),
+ reinterpret_cast<const float*>(xalpha),
+ reinterpret_cast<float*>(vbuf));
+ continue;
+ }
+ }
+ #endif // HAVE_AVX2
+
+ #ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
if (std::is_same<T, uchar>::value) {
calcRowArea_8U(reinterpret_cast<uchar*>(dst),
continue;
}
}
- #endif // HAVE_SSE
+ #endif // HAVE_SSE
// vertical pass
int y_1st = ymap.index0;
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
#ifndef IE_PREPROCESS_GAPI_KERNELS_SIMD_IMPL_H
#define IE_PREPROCESS_GAPI_KERNELS_SIMD_IMPL_H
target_include_directories(${TARGET_NAME} PUBLIC ${IE_MAIN_SOURCE_DIR}/include)
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
# developer package
# install
install(TARGETS ${TARGET_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
- ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+ ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
auto input_shape = fq->input(0).get_source_output().get_shape();
- std::vector<std::shared_ptr<ngraph::Node> > fq_inputs;
+ ngraph::OutputVector fq_inputs;
for (size_t i = 0; i < fq->inputs().size(); ++i) {
std::shared_ptr<ngraph::Node> fq_input;
fq_input = fq->input(i).get_source_output().get_node_shared_ptr();
fq_input = std::make_shared<ngraph::opset1::Unsqueeze>(fq_input,
opset1::Constant::create(element::i64, Shape{unsqueeze_axes.size()}, unsqueeze_axes));
}
- fq_input = transpose->copy_with_new_args({fq_input, const_order});
+ fq_input = transpose->copy_with_new_inputs({fq_input, const_order});
fq_inputs.push_back(fq_input);
}
- auto new_fq = fq->copy_with_new_args(fq_inputs);
+ auto new_fq = fq->copy_with_new_inputs(fq_inputs);
new_fq->set_friendly_name(fq->get_friendly_name());
ngraph::replace_node(transpose, new_fq);
auto m = std::make_shared<ngraph::pattern::Matcher>(transpose, "PullTransposeThroughFQUp");
this->add_matcher(m, callback, PassProperty::CHANGE_DYNAMIC_STATE);
-}
\ No newline at end of file
+}
return util::normalize_single_value(const_node->get_vector<float16>(), value);
case element::Type_t::f32:
return util::normalize_single_value(const_node->get_vector<float>(), value);
+ case element::Type_t::bf16:
+ return util::normalize_single_value(const_node->get_vector<bfloat16>(), value);
case element::Type_t::f64:
return util::normalize_single_value(const_node->get_vector<double>(), value);
case element::Type_t::i8:
ie_developer_export_targets(${TARGET_NAME})
endif()
+
+ target_link_libraries(${TARGET_NAME} PUBLIC ${NGRAPH_LIBRARIES} inference_engine_transformations)
endfunction()
add_common_target("vpu_common_lib" FALSE)
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+
+#include <memory>
+
+namespace ngraph { namespace op {
+
+class DynamicShapeResolver : public Op {
+public:
+ static constexpr NodeTypeInfo type_info{"DynamicShapeResolver", 1};
+ const NodeTypeInfo& get_type_info() const override { return type_info; }
+
+ DynamicShapeResolver(const Output<Node>& tensorWithData, const Output<Node>& tensorWithDims);
+
+ void validate_and_infer_types() override;
+
+ std::shared_ptr<Node> copy_with_new_args(const NodeVector& new_args) const override;
+
+ bool visit_attributes(ngraph::AttributeVisitor& visitor) override;
+};
+
+} // namespace op
+} // namespace ngraph
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/node.hpp>
+#include <ngraph/op/op.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace ngraph {
+namespace op {
+
+class StaticShapeNonZero : public Op {
+public:
+ static constexpr NodeTypeInfo type_info{"StaticShapeNonZero", 1};
+ const NodeTypeInfo& get_type_info() const override { return type_info; }
+
+ explicit StaticShapeNonZero(const Output<ngraph::Node>& input);
+
+ void validate_and_infer_types() override;
+
+ std::shared_ptr<Node> copy_with_new_args(const NodeVector& new_args) const override;
+
+ bool visit_attributes(ngraph::AttributeVisitor& visitor) override;
+};
+} // namespace op
+} // namespace ngraph
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+
+#include <vector>
+#include <memory>
+
+namespace ngraph {
+namespace pass {
+
+class DynamicToStaticShape : public FunctionPass {
+public:
+ DynamicToStaticShape() = default;
+
+ bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
+
+private:
+ bool validateStaticShapes(std::shared_ptr<ngraph::Function> function) const;
+};
+
+} // namespace pass
+} // namespace ngraph
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+
+#include <vector>
+#include <memory>
+
+namespace ngraph {
+namespace pass {
+
+class DynamicToStaticShapeNonZero : public GraphRewrite {
+public:
+ DynamicToStaticShapeNonZero();
+};
+
+} // namespace pass
+} // namespace ngraph
} // namespace details
#define VPU_THROW_FORMAT(...) \
- vpu::details::throwFormat<details::VPUException>(__FILE__, __LINE__, __VA_ARGS__)
+ vpu::details::throwFormat<vpu::details::VPUException>(__FILE__, __LINE__, __VA_ARGS__)
#define VPU_THROW_UNLESS(condition, ...) \
do { \
if (!(condition)) { \
- vpu::details::throwFormat<details::VPUException>(__FILE__, __LINE__, __VA_ARGS__); \
+ vpu::details::throwFormat<vpu::details::VPUException>(__FILE__, __LINE__, __VA_ARGS__); \
} \
} while (false)
#define VPU_THROW_UNSUPPORTED_UNLESS(condition, ...) \
do { \
if (!(condition)) { \
- vpu::details::throwFormat<details::UnsupportedLayerException>(__FILE__, __LINE__, __VA_ARGS__); \
+ vpu::details::throwFormat<vpu::details::UnsupportedLayerException>(__FILE__, __LINE__, __VA_ARGS__); \
} \
} while (false)
InferenceEngine::Layout deviceLayout(InferenceEngine::Layout const& layout,
LayoutPreference const& layoutPreference);
-ie::Blob::Ptr getBlobFP16(const ie::Blob::Ptr& in);
+ie::Blob::Ptr convertBlobFP32toFP16(const ie::Blob::CPtr& in);
ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& original);
ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& in, ie::Layout outLayout, void* ptr = nullptr);
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+namespace ngraph { namespace op {
+
+constexpr NodeTypeInfo DynamicShapeResolver::type_info;
+
+DynamicShapeResolver::DynamicShapeResolver(const Output<Node>& tensorWithData, const Output<Node>& tensorWithDims)
+ : Op(OutputVector{tensorWithData, tensorWithDims}) {
+ constructor_validate_and_infer_types();
+}
+
+std::shared_ptr<Node> DynamicShapeResolver::copy_with_new_args(const NodeVector& new_args) const {
+ check_new_args_count(this, new_args);
+ return std::make_shared<DynamicShapeResolver>(new_args.at(0), new_args.at(1));
+}
+
+void DynamicShapeResolver::validate_and_infer_types() {
+ NODE_VALIDATION_CHECK(this, get_input_size() == 2, "(", get_friendly_name(), ") supports only ", 2, " inputs, but ", get_input_size(), " provided");
+ NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static(), "(", get_friendly_name(), ") does not support dynamic shape for data tensor");
+ NODE_VALIDATION_CHECK(this, get_input_partial_shape(1).is_static(), "(", get_friendly_name(), ") does not support dynamic shape for dims tensor");
+
+ const auto& dimsElementType = get_input_element_type(1);
+ NODE_VALIDATION_CHECK(this, dimsElementType.is_integral_number() && dimsElementType.is_static(), "(", get_friendly_name(), ") supports only integral "
+ "number type for dims tensor, but ", dimsElementType, " provided");
+
+ const auto& dataShape = get_input_shape(0);
+ const auto& dimsShape = get_input_shape(1);
+ NODE_VALIDATION_CHECK(this, dimsShape.size() == 1 && dimsShape.front() == dataShape.size(), "(", get_friendly_name(), ") inputs shapes mismatch: first "
+ "input shape = ", dataShape, " second input shape = ", dimsShape, " but ", dataShape, " and ", Shape{dataShape.size()}, " are expected");
+
+ set_output_type(0, get_input_element_type(0), get_input_shape(0));
+}
+
+bool DynamicShapeResolver::visit_attributes(ngraph::AttributeVisitor& visitor) {
+ return true;
+}
+
+} // namespace op
+} // namespace ngraph
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/static_shape_nonzero.hpp"
+
+namespace ngraph {
+namespace op {
+
+constexpr NodeTypeInfo StaticShapeNonZero::type_info;
+
+StaticShapeNonZero::StaticShapeNonZero(const Output<Node>& input)
+ : Op({input}) {
+ constructor_validate_and_infer_types();
+}
+
+void StaticShapeNonZero::validate_and_infer_types() {
+ NODE_VALIDATION_CHECK(this, get_input_size() == 1,
+ "StaticShapeNonZero must have only 1 input, provided: ",
+ get_input_size());
+
+ const auto& arg_shape = get_input_partial_shape(0);
+ NODE_VALIDATION_CHECK(this, arg_shape.is_static(),
+ "StaticShapeNonZero doesn't support dynamic input shape");
+
+ const auto& input_et = get_input_element_type(0);
+ NODE_VALIDATION_CHECK(this,
+ input_et.is_integral_number() || input_et.is_real(),
+ "StaticShapeNonZero input data type needs to be a numeric type. Got: ",
+ input_et);
+
+ const auto total_dim_size = Dimension(shape_size(arg_shape.to_shape()));
+ set_output_type(0, element::i64, {arg_shape.rank(), total_dim_size});
+ set_output_type(1, element::i64, {Dimension(2)});
+}
+
+std::shared_ptr<Node> StaticShapeNonZero::copy_with_new_args(
+ const NodeVector& new_args) const {
+ check_new_args_count(this, new_args);
+ return std::make_shared<StaticShapeNonZero>(new_args.at(0));
+}
+
+bool StaticShapeNonZero::visit_attributes(ngraph::AttributeVisitor& visitor) {
+ return true;
+}
+
+} // namespace op
+} // namespace ngraph
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape.hpp"
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp"
+
+#include <vpu/utils/error.hpp>
+
+namespace ngraph {
+namespace pass {
+
+bool DynamicToStaticShape::run_on_function(std::shared_ptr<ngraph::Function> function) {
+ DynamicToStaticShapeNonZero().run_on_function(function);
+
+ return validateStaticShapes(function);
+}
+
+bool DynamicToStaticShape::validateStaticShapes(std::shared_ptr<ngraph::Function> function) const {
+ function->validate_nodes_and_infer_types();
+
+ for (const auto& node : function->get_ops()) {
+ for (const auto& output : node->get_outputs()) {
+ const auto outputPartialShape = output.get_partial_shape();
+ VPU_THROW_UNLESS(outputPartialShape.is_static(),
+ "DynamicToStaticShape pass: after all the transformations there is "
+ "still dynamism in the network. First met node with dynamic output: "
+ "%s (type: %s)", node->get_friendly_name(), node->get_type_name());
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace pass
+} // namespace ngraph
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp"
+
+#include <vpu/ngraph/operations/static_shape_nonzero.hpp>
+#include <vpu/ngraph/operations/dynamic_shape_resolver.hpp>
+
+#include <ngraph/opsets/opset3.hpp>
+
+#include <memory>
+
+namespace ngraph {
+namespace pass {
+
+DynamicToStaticShapeNonZero::DynamicToStaticShapeNonZero() {
+ // We don't set strict_mode when use pattern Matcher,
+ // so we can set any type and shape for input.
+ auto inputWithAnyTypeAndShape = std::make_shared<pattern::op::Label>(
+ element::dynamic, PartialShape{});
+ auto nonZeroPattern = std::make_shared<ngraph::op::NonZero>(inputWithAnyTypeAndShape);
+
+ ngraph::graph_rewrite_callback callback = [](pattern::Matcher& matcher) {
+ const auto nonZero = std::dynamic_pointer_cast<ngraph::opset3::NonZero>(matcher.get_match_root());
+ if (!nonZero) {
+ return false;
+ }
+
+ auto staticShapeNonZero = std::make_shared<ngraph::op::StaticShapeNonZero>(
+ nonZero->input(0).get_source_output());
+ staticShapeNonZero->set_friendly_name(nonZero->get_friendly_name() + "/static_shape");
+
+ auto dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(
+ staticShapeNonZero->output(0), staticShapeNonZero->output(1));
+ dynamicShapeResolver->set_friendly_name(nonZero->get_friendly_name() + "/resolve_shape");
+
+ ngraph::replace_node(matcher.get_match_root(), dynamicShapeResolver);
+ return true;
+ };
+
+ const auto matcher = std::make_shared<ngraph::pattern::Matcher>(
+ nonZeroPattern, "DynamicToStaticShapeNonZero");
+ this->add_matcher(matcher, callback, PassProperty::CHANGE_DYNAMIC_STATE);
+}
+
+} // namespace pass
+} // namespace ngraph
return layout;
}
-ie::Blob::Ptr getBlobFP16(const ie::Blob::Ptr& in) {
- IE_PROFILING_AUTO_SCOPE(getBlobFP16);
+ie::Blob::Ptr convertBlobFP32toFP16(const ie::Blob::CPtr& in) {
+ IE_PROFILING_AUTO_SCOPE(convertBlobFP32toFP16);
auto inDesc = in->getTensorDesc();
auto precision = inDesc.getPrecision();
- if (precision == ie::Precision::FP16)
- return in;
-
if (precision != ie::Precision::FP32) {
VPU_THROW_EXCEPTION << "Unsupported precision " << precision.name();
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-// Define if runtime supports it. MX runtime is compatible, KMB is in WIP state
+// Define if runtime supports it. MX runtime is compatible
#define USE_MANUAL_DMA 1
#if defined (USE_MANUAL_DMA)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-// Define if runtime supports it. MX runtime is compatible, KMB is in WIP state
+// Define if runtime supports it. MX runtime is compatible
#define USE_MANUAL_DMA 1
// Set to 1 if only output is zerroed before kernel execution
#pragma once
-#include <memory>
+#include <vpu/graph_transformer.hpp>
+
+#include <vpu/model/model.hpp>
+#include <vpu/backend/blob_format.hpp>
+#include <ie_layers.h>
+
#include <string>
+#include <memory>
#include <set>
#include <vector>
#include <utility>
-#include <ie_layers.h>
-
-#include <vpu/graph_transformer.hpp>
-#include <vpu/model/model.hpp>
-
namespace vpu {
namespace ie = InferenceEngine;
std::pair<char*, size_t>& blobHeader,
int& numActiveStages);
+ int serializeIOInfoSection(
+ const Model& model,
+ DataUsage dataUsage,
+ BlobSerializer& blobSerializer);
+
+ void serializeConstData(
+ const Model& model,
+ const mv_blob_header& blobHdr,
+ std::vector<char>& blob);
+
+ void serializeConstShapes(
+ const Model& model,
+ const mv_blob_header& blobHdr,
+ std::vector<char>& blob);
+
+ ElfN_Ehdr createElfHeader();
+
void getMetaData(
const Model& model,
const std::vector<ie::CNNLayerPtr>& allLayers,
void parseOneHot(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
void parseExpPriorGridGenerator(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
void parseExpGenerateProposals(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+ void parseScatterUpdate(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+ void parseExpTopKROIs(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+ void parseNonZero(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+ void parseROIAlign(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
//
// Special layers
//
const uint32_t BLOB_MAGIC_NUMBER = 9709;
-const uint32_t BLOB_VERSION_MAJOR = 5;
+const uint32_t BLOB_VERSION_MAJOR = 6;
const uint32_t BLOB_VERSION_MINOR = 0;
} // namespace vpu
* Allocates memory for single data node
*/
bool allocateData(const Data& data);
+ ShapeLocation allocateConstShape(Data& data);
void freeData(const Data& data, DeallocationMode mode = DeallocationMode::JustFree);
void selfCheck();
double cost = std::numeric_limits<double>::max();
};
+//
+// Structs for split
+//
+
+struct Slice {
+ int start;
+ size_t size;
+
+ Slice(int start, size_t size) :
+ start(start),
+ size(size) {}
+};
+
+struct DataSlice {
+ Data data;
+ Slice slice;
+
+ DataSlice(Data data, Slice slice) :
+ data(std::move(data)),
+ slice(slice) {}
+};
+
+using DataSlices = std::vector<DataSlice>;
+
+struct ConvTileSlice {
+ HwConvTileInfo tile;
+ Slice slice;
+
+ ConvTileSlice(HwConvTileInfo tile, Slice slice) :
+ tile(tile),
+ slice(slice) {}
+};
+
void printTo(std::ostream& os, const HwConvTileInfo& convTiles);
void printTo(DotLabel& lbl, const HwConvTileInfo& convTiles);
void printTo(std::ostream& os, const HwPaddingInfo& hwPad);
void printTo(DotLabel& lbl, const HwPaddingInfo& hwPad);
-
-//
-// HwWeightsContent
-//
-
-class HwWeightsContent final : public CalculatedDataContent {
-public:
- HwWeightsContent(
- const DataContent::Ptr& origContent,
- const DataDesc& origWeightsDesc,
- int numInputChannels,
- int channelStartIndex = 0);
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override;
-
-private:
- DataDesc _origWeightsDesc;
- int _numInputChannels = 0;
- int _channelStartIndex = 0;
-};
-
//
// calculateHwBufferSize
//
}
//
-// DefaultSwWeightsContent
-//
-
-class DefaultSwWeightsContent final : public CalculatedDataContent {
-public:
- explicit DefaultSwWeightsContent(const DataContent::Ptr& origContent);
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override;
-};
-
-//
// getOneOfSingleNextStage
//
#pragma once
-#include <memory>
-#include <string>
-#include <functional>
-#include <vector>
-
-#include <ie_data.h>
-#include <ie_blob.h>
-
#include <vpu/model/base.hpp>
#include <vpu/model/edges.hpp>
#include <vpu/model/data_desc.hpp>
+#include <vpu/model/data_contents/data_content.hpp>
#include <vpu/backend/blob_serializer.hpp>
#include <vpu/utils/enums.hpp>
#include <vpu/utils/func_ref.hpp>
+#include <ie_data.h>
+#include <ie_blob.h>
+
+#include <memory>
+#include <string>
+#include <functional>
+#include <vector>
+
namespace vpu {
namespace ie = InferenceEngine;
)
//
-// DataLocation
+// Location
//
//
-// Describes where Data object is located.
+// Describes where particular data or shape is located.
//
// Must be synchronized with MvTensor
-VPU_DECLARE_ENUM(DataLocation,
+VPU_DECLARE_ENUM(Location,
None = 0,
Input = 1,
Output = 2,
DDR,
CMX)
-//
-// DataContent
-//
-
-//
-// Content of the Const Data object.
-//
-
-class DataContent {
-public:
- using Ptr = std::shared_ptr<DataContent>;
-
- virtual ~DataContent();
-
- // TYPED pointer
- template <typename T>
- const T* get() const {
- return static_cast<const T*>(getRaw());
- }
-
- const DataDesc& desc() const {
- return _desc;
- }
-
-private:
- // RAW pointer
- virtual const void* getRaw() const = 0;
-
-private:
- DataDesc _desc;
-
- friend ModelObj;
+struct DataLocation final {
+ Location location;
+ int offset;
};
-//
-// Data content that is calculated on the fly, using lazy calculation:
-//
-// * It performs calculation on the first call and stores it in internal buffer.
-// * Next access will return the pointer to calculated buffer.
-//
-class CalculatedDataContent : public DataContent {
-public:
- CalculatedDataContent() = default;
- explicit CalculatedDataContent(const SmallVector<DataContent::Ptr, 2>& baseContents) : _baseContents(baseContents) {}
-
-private:
- const void* getRaw() const override;
-
- virtual size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>& baseContents) const;
- virtual void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const = 0;
-
-private:
- mutable SmallVector<DataContent::Ptr, 2> _baseContents;
- mutable std::vector<uint8_t> _temp;
+static constexpr DataLocation defaultDataLocation = {
+ Location::None, 0
};
-DataContent::Ptr ieBlobContent(
- const ie::Blob::Ptr& blob,
- int repeat = 1);
-
-DataContent::Ptr replicateContent(float val, int count);
-DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count);
-
-DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale);
+struct ShapeLocation final {
+ Location dimsLocation;
+ int dimsOffset;
+ Location stridesLocation;
+ int stridesOffset;
+};
-// The function scales the major dimension of 4D origContent
-DataContent::Ptr scaledChannelContent(
- const DataContent::Ptr& origContent,
- const DataContent::Ptr& scaleContent);
+static constexpr ShapeLocation defaultShapeLocation = {
+ Location::None, 0, Location::None, 0
+};
//
// DataNode
//
VPU_MODEL_ATTRIBUTE(MemoryType, memReqs, MemoryType::DDR)
- VPU_MODEL_ATTRIBUTE(DataLocation, location, DataLocation::None)
- VPU_MODEL_ATTRIBUTE(int, memoryOffset, 0)
+ VPU_MODEL_ATTRIBUTE(DataLocation, dataLocation, defaultDataLocation)
+ VPU_MODEL_ATTRIBUTE(ShapeLocation, shapeLocation, defaultShapeLocation)
//
// Edges wrappers
void setMemReqs(MemoryType mem);
- void setIOInfo(DataLocation location, int ioBufferOffset);
+ void setIOInfo(Location location, int ioBufferOffset);
- void setAllocationInfo(DataLocation location, int memoryOffset);
+ void setDataAllocationInfo(const DataLocation& dataLocation);
+
+ void setShapeAllocationInfo(const ShapeLocation& shapeLocation);
//
// Backend utilities
//
// Serialize as-is for new MvTensor kernels that can work with ND data.
- // If `newOrder` is not empty, it will be used instead of original and missing dimensions will be set to 1.
- void serializeBuffer(
- BlobSerializer& serializer,
- DimsOrder newOrder = DimsOrder());
+ void serializeBuffer(BlobSerializer& serializer);
void serializeIOInfo(BlobSerializer& serializer) const;
const DataDesc& storedDesc,
const DimValues& storedStrides) const;
- void serializeBufferImpl(
- BlobSerializer& serializer,
- const DataDesc& storedDesc,
- const DimValues& storedStrides) const;
-
private:
inline DataNode() :
_consumerEdges(&StageInputEdge::_posInData),
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// BatchNormalizationWeightsContent
+//
+
+class BatchNormalizationWeightsContent final : public CalculatedDataContent {
+public:
+ BatchNormalizationWeightsContent(const DataContent::Ptr& origContent, float epsilon);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void* tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ float _epsilon;
+};
+
+//
+// BatchNormalizationBiasesContent
+//
+
+class BatchNormalizationBiasesContent final : public CalculatedDataContent {
+public:
+ BatchNormalizationBiasesContent(const DataContent::Ptr& origContent, const DataContent::Ptr& weightsContent);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void* tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataContent::CPtr _weightsContent;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+#include <vpu/utils/small_vector.hpp>
+#include <vpu/model/data_desc.hpp>
+
+namespace vpu {
+
+//
+// Data content that is calculated on the fly, using lazy calculation:
+//
+// * It performs calculation on the first call and stores it in internal buffer.
+// * Next access will return the pointer to calculated buffer.
+//
+
+class CalculatedDataContent : public DataContent {
+public:
+ CalculatedDataContent() = default;
+
+private:
+ const void* getRaw() const override;
+
+ virtual void fillTempBuf(void *tempBuf) const = 0;
+
+private:
+ mutable std::vector<uint8_t> _temp;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// ConvIm2ColWeightsContent
+//
+
+class ConvIm2ColWeightsContent final : public CalculatedDataContent {
+public:
+ explicit ConvIm2ColWeightsContent(const DataContent::Ptr& origContent, DataDesc desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void* tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _desc;
+};
+
+//
+// Conv3x3WeightsContent
+//
+
+class Conv3x3WeightsContent final : public CalculatedDataContent {
+public:
+ explicit Conv3x3WeightsContent(const DataContent::Ptr& origContent, DataDesc desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void* tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _desc;
+};
+
+//
+// ConvCHWWeightsContent
+//
+
+class ConvCHWWeightsContent final : public CalculatedDataContent {
+public:
+ explicit ConvCHWWeightsContent(const DataContent::Ptr& origContent, DataDesc desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void* tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _desc;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/utils/numeric.hpp>
+
+#include <details/ie_exception.hpp>
+
+#include <memory>
+#include <cstdint>
+
+namespace vpu {
+
+class DataContent {
+public:
+ using Ptr = std::shared_ptr<DataContent>;
+ using CPtr = std::shared_ptr<const DataContent>;
+
+ virtual ~DataContent();
+
+ template<typename T>
+ const T* get() const {
+ return static_cast<const T*>(getRaw());
+ }
+
+ virtual size_t byteSize() const = 0;
+
+private:
+ virtual const void* getRaw() const = 0;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// DeconvolutionToConvolutionContent
+//
+
+class DeconvolutionToConvolutionContent final : public CalculatedDataContent {
+public:
+ DeconvolutionToConvolutionContent(const DataContent::Ptr& origContent, const DataDesc& desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _desc;
+};
+
+//
+// DepthDeconvolutionCHWWeightsContent
+//
+
+class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
+public:
+ DepthDeconvolutionCHWWeightsContent(
+ const DataContent::Ptr& origContent,
+ int KX, int KY, int channels);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ int _KX;
+ int _KY;
+ int _channels;
+};
+
+//
+// DepthDeconvolutionHWCWeightsContent
+//
+
+class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
+public:
+ DepthDeconvolutionHWCWeightsContent(
+ const DataContent::Ptr& origContent,
+ int KX, int KY, int channels);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ int _KX;
+ int _KY;
+ int _channels;
+};
+
+//
+// DeconvolutionWeightsContent
+//
+
+class DeconvolutionWeightsContent final : public CalculatedDataContent {
+public:
+ DeconvolutionWeightsContent(
+ const DataContent::Ptr& origContent,
+ DataDesc desc,
+ int KX, int KY,
+ int IC, int OC);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataDesc _desc;
+ DataContent::CPtr _origContent;
+ mutable std::vector<fp16_t> _intermBuf;
+ int _KX;
+ int _KY;
+ int _IC;
+ int _OC;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class DefaultSwWeightsContent final : public CalculatedDataContent {
+public:
+ DefaultSwWeightsContent(const DataContent::Ptr& origContent, const DataDesc& desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void* tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _desc;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+#include <vpu/middleend/hw/tiling.hpp>
+
+namespace vpu {
+
+class HwConstData final : public CalculatedDataContent {
+public:
+ HwConstData(
+ const DataContent::Ptr& origContent,
+ const DataDesc& origDesc,
+ const DataDesc& resDesc,
+ const std::map<Dim, Slice> dimSlices);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *outBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _origDesc;
+ DataDesc _resDesc;
+ std::map<Dim, Slice> _dimSlices;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class HwWeightsContent final : public CalculatedDataContent {
+public:
+ HwWeightsContent(
+ const DataContent::Ptr& origContent,
+ const DataDesc& origWeightsDesc,
+ const DataDesc& resDesc,
+ int numInputChannels,
+ int channelStartIndex = 0);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ DataDesc _origDesc;
+ DataDesc _resDesc;
+ int _numInputChannels = 0;
+ int _channelStartIndex = 0;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+#include <vpu/model/data.hpp>
+
+namespace vpu {
+
+class IeBlobContent final : public DataContent {
+public:
+ IeBlobContent(const ie::Blob::CPtr& blob, DataType resultDataType);
+
+ size_t byteSize() const override;
+
+protected:
+ const void* getRaw() const override;
+
+private:
+ DataType _resultDataType;
+ mutable ie::Blob::CPtr _blob;
+ mutable ie::Blob::CPtr _blobFp16;
+};
+
+DataContent::Ptr ieBlobContent(const ie::Blob::CPtr& blob, DataType resultPrecision = DataType::FP16);
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+namespace vpu {
+
+class KernelBinaryContent final : public DataContent {
+public:
+ explicit KernelBinaryContent(const std::string& blob);
+
+ size_t byteSize() const override;
+
+protected:
+ const void* getRaw() const override;
+
+private:
+ std::string _blob;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+#include <ie_preprocess.hpp>
+
+namespace vpu {
+
+//
+// MeanImageContent
+//
+
+class MeanImageContent final : public CalculatedDataContent {
+public:
+ MeanImageContent(const ie::PreProcessInfo& info, const DataDesc& desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataDesc _desc;
+ ie::PreProcessInfo _info;
+};
+
+//
+// MeanValueContent
+//
+
+class MeanValueContent final : public CalculatedDataContent {
+public:
+ explicit MeanValueContent(const ie::PreProcessInfo& info);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ ie::PreProcessInfo _info;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class MergeFullyConnectedContentsByChannels final : public CalculatedDataContent {
+public:
+ MergeFullyConnectedContentsByChannels(const std::vector<DataContent::CPtr> contents,
+ const std::vector<DataDesc> inDescs,
+ const DataDesc& resDesc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *temp) const override;
+
+private:
+ std::vector<DataContent::CPtr> _contents;
+ std::vector<DataDesc> _inDescs;
+ DataDesc _resDesc;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+namespace vpu {
+
+class MTCNNBlobContent final : public DataContent {
+public:
+ explicit MTCNNBlobContent(std::vector<char> blob);
+
+ size_t byteSize() const override;
+
+protected:
+ const void* getRaw() const override;
+
+private:
+ std::vector<char> _blob;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+#include <vpu/model/data_desc.hpp>
+
+#include <ie_blob.h>
+
+namespace vpu {
+
+class PReLUBlobContent final : public DataContent {
+public:
+ PReLUBlobContent(const InferenceEngine::Blob::CPtr& blob, const DataDesc& desc, int repeat);
+
+ size_t byteSize() const override;
+
+protected:
+ const void* getRaw() const override;
+
+private:
+ InferenceEngine::Blob::CPtr _blob;
+ int _repeat = 0;
+ DataDesc _desc;
+
+ mutable InferenceEngine::Blob::CPtr _blobFp16;
+ mutable std::vector<fp16_t> _tempFp16;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// PriorBoxContent
+//
+
+class PriorBoxContent final : public CalculatedDataContent {
+public:
+ PriorBoxContent(
+ const DataDesc& inDesc0,
+ const DataDesc& inDesc1,
+ const DataDesc& outDesc,
+ const ie::CNNLayerPtr &layer);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataDesc _inDesc0;
+ DataDesc _inDesc1;
+ DataDesc _outDesc;
+ ie::CNNLayerPtr _layer;
+};
+
+//
+// PriorBoxClusteredContent
+//
+
+class PriorBoxClusteredContent final : public CalculatedDataContent {
+public:
+ PriorBoxClusteredContent(
+ const DataDesc& inDesc0,
+ const DataDesc& inDesc1,
+ const DataDesc& outDesc,
+ const ie::CNNLayerPtr& layer);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataDesc _inDesc0;
+ DataDesc _inDesc1;
+ DataDesc _outDesc;
+ ie::CNNLayerPtr _layer;
+};
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class ReplicatedContent final : public CalculatedDataContent {
+public:
+ ReplicatedContent(float val, int count, const DataDesc& desc);
+
+ ReplicatedContent(DataContent::Ptr origContent, int count, const DataDesc& desc);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent = nullptr;
+ DataDesc _desc;
+ float _factor = 1.0f;
+ int _count = 0;
+};
+
+DataContent::Ptr replicateContent(float val, int count, const DataDesc& desc);
+DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count, const DataDesc& desc);
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class ScaledContent final : public CalculatedDataContent {
+public:
+ ScaledContent(const DataContent::Ptr& origContent, float scale);
+
+ size_t byteSize() const override;
+
+protected:
+ void fillTempBuf(void *tempBuf) const override;
+
+private:
+ DataContent::CPtr _origContent;
+ float _factor = 1.0f;
+};
+
+DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale);
+
+} // namespace vpu
int totalDimSize() const;
+ int dimsByteSize() const { return numDims() * static_cast<int>(sizeof(int32_t)); }
+
//
// DimsOrder
//
Exp = 101,
Floor = 102,
TopK = 104,
+ ScatterUpdate = 103,
ReduceMin = 105,
ExpDetectionOutput = 106, // ExperimentalDetectronDetectionOutput
NonMaxSuppression = 107,
LoopStart = 119,
LoopEnd = 120,
ExpPriorGridGenerator = 121,
+ NonZero = 122,
+ ROIAlign = 123,
ExpGenerateProposals = 124,
+ ExpTopKROIs = 125,
)
//
const DataVector& inputs,
const Data& output);
+ Stage addScatterUpdateStage(
+ const Model& model,
+ const std::string& name,
+ const ie::CNNLayerPtr& layer,
+ const Data& input,
+ const Data& output,
+ const Data& indices,
+ const Data& updates,
+ const Data& axis);
+
Stage addLoopStartStage(
const Model& model,
const std::string& name,
} else if (data->usage() == DataUsage::Temp) {
dataColor = "cyan";
} else if (data->usage() == DataUsage::Intermediate) {
- if (data->location() == DataLocation::BSS) {
+ if (data->dataLocation().location == Location::BSS) {
dataColor = "cyan";
- } else if (data->location() == DataLocation::CMX) {
+ } else if (data->dataLocation().location == Location::CMX) {
dataColor = "magenta";
- } else if (data->location() == DataLocation::Blob) {
+ } else if (data->dataLocation().location == Location::Blob) {
dataColor = "aquamarine";
- } else if (data->location() == DataLocation::Input) {
+ } else if (data->dataLocation().location == Location::Input) {
dataColor = "green";
- } else if (data->location() == DataLocation::Output) {
+ } else if (data->dataLocation().location == Location::Output) {
dataColor = "deepskyblue";
}
}
}
}
lbl.appendPair("memReqs", data->memReqs());
- lbl.appendPair("location", data->location());
- lbl.appendPair("memoryOffset", data->memoryOffset());
+ lbl.appendPair("location", data->dataLocation().location);
+ lbl.appendPair("memoryOffset", data->dataLocation().offset);
if (!data->attrs().empty()) {
lbl.appendPair("extraAttrs", data->attrs());
}
#include <vpu/backend/backend.hpp>
+#include <vpu/parsed_config.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/utils/numeric.hpp>
+
+#include <precision_utils.h>
+#include <details/caseless.hpp>
+#include <graph_tools.hpp>
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+
#include <climits>
#include <cstring>
-
#include <string>
#include <memory>
#include <list>
#include <iomanip>
#include <atomic>
-#include <precision_utils.h>
-#include <details/caseless.hpp>
-#include <graph_tools.hpp>
-#include <description_buffer.hpp>
-#include <xml_parse_utils.h>
-
-#include <vpu/parsed_config.hpp>
-#include <vpu/compile_env.hpp>
-#include <vpu/backend/blob_format.hpp>
-#include <vpu/utils/auto_scope.hpp>
-#include <vpu/utils/dot_io.hpp>
-#include <vpu/utils/file_system.hpp>
-#include <vpu/utils/numeric.hpp>
-
namespace vpu {
-void BackEnd::serialize(
+struct ModelStagesStat final {
+ bool hasHwStage;
+ bool hasShaveStage;
+ bool hasDmaStage;
+};
+
+int BackEnd::serializeIOInfoSection(
const Model& model,
- std::vector<char>& blob,
- std::pair<char*, size_t>& blobHeader,
- int& numActiveStages) {
- VPU_PROFILE(serialize);
+ DataUsage dataUsage,
+ BlobSerializer& blobSerializer) {
+ VPU_INTERNAL_CHECK(dataUsage == DataUsage::Input || dataUsage == DataUsage::Output,
+ "serializeIOInfoSection was called with {} usage while only {} and {} usages are supported",
+ dataUsage, DataUsage::Input, DataUsage::Output);
- const auto& env = CompileEnv::get();
+ int datasNumber = 0;
+
+ for (const auto& data : model->datas()) {
+ if (data->usage() != dataUsage) {
+ continue;
+ }
- auto batchSize = model->batchSize();
- auto usedMemory = model->attrs().get<UsedMemory>("usedMemory");
+ if (dataUsage == DataUsage::Input) {
+ VPU_INTERNAL_CHECK(data->producerEdge() == nullptr,
+ "serializeIOInfoSection failed on input data {}. Input must have no producer but actually it has: {} with type {}",
+ data->name(), data->producerEdge()->producer()->name(), data->producerEdge()->producer()->type());
+ VPU_INTERNAL_CHECK(data->numConsumers() != 0,
+ "serializeIOInfoSection failed on input data {}. Input must have at least one consumer but it doesn't ",
+ data->usage());
+ }
- //
- // Remove special stages from the stages list
- //
+ if (dataUsage == DataUsage::Output) {
+ VPU_INTERNAL_CHECK(data->producerEdge() != nullptr,
+ "serializeIOInfoSection failed on output data {}. Output must have any producer but it doesn't",
+ data->usage());
+ }
- bool hasHwStage = false;
- bool hasShaveStage = false;
- bool hasDmaStage = false;
+ VPU_INTERNAL_CHECK(data->parentDataEdge() == nullptr,
+ "serializeIOInfoSection failed on {} with usage {}. IO data must have no parentDatas but it does");
- StageVector execStages;
- execStages.reserve(model->numStages());
+ VPU_INTERNAL_CHECK(!data->attrs().has("ioIdx"),
+ "serializeIOInfoSection failed: IO data {} with usage {} doesn't have ioIdx attribute",
+ data->name(), data->usage());
- for (const auto& stage : model->getStages()) {
- if (stage->category() == StageCategory::Special) {
- continue;
- }
+ data->attrs().set("ioIdx", datasNumber);
- if (stage->category() == StageCategory::HW) {
- hasHwStage = true;
- } else if (stage->category() == StageCategory::SHAVE) {
- hasShaveStage = true;
- } else if (stage->category() == StageCategory::DMA) {
- hasDmaStage = true;
- }
+ data->serializeIOInfo(blobSerializer);
- execStages.emplace_back(stage);
+ ++datasNumber;
}
- numActiveStages = execStages.size();
+ return datasNumber;
+}
- //
- // I/O info sections
- //
+ElfN_Ehdr BackEnd::createElfHeader() {
+ ElfN_Ehdr elfHdr = {};
+ elfHdr.e_ident[0] = 0x7f;
+ elfHdr.e_ident[1] = 'e';
+ elfHdr.e_ident[2] = 'l';
+ elfHdr.e_ident[3] = 'f';
+ for (int i = 4; i < 16; i++) {
+ elfHdr.e_ident[i] = 0;
+ }
+ elfHdr.e_type = 1;
+ elfHdr.e_machine = 2;
+ elfHdr.e_version = 2;
+ elfHdr.e_entry = 0;
+ elfHdr.e_phoff = 0;
+ elfHdr.e_shoff = 0;
+ elfHdr.e_ehsize = 8 * sizeof(elfHdr);
- int numInputs = 0;
- BlobSerializer inputInfoSerializer;
+ return elfHdr;
+}
+
+void BackEnd::serializeConstData(const Model& model, const mv_blob_header& blobHdr, std::vector<char>& blob) {
for (const auto& data : model->datas()) {
- if (data->usage() != DataUsage::Input) {
+ if (data->usage() != DataUsage::Const) {
continue;
}
IE_ASSERT(data->producerEdge() == nullptr);
IE_ASSERT(data->parentDataEdge() == nullptr);
IE_ASSERT(data->numConsumers() != 0);
+ IE_ASSERT(data->dataLocation().location == Location::Blob);
- IE_ASSERT(!data->attrs().has("ioIdx"));
- data->attrs().set("ioIdx", numInputs);
-
- data->serializeIOInfo(inputInfoSerializer);
+ const auto content = data->content();
+ IE_ASSERT(content != nullptr);
- ++numInputs;
+ std::copy_n(content->get<uint8_t>(), content->byteSize(), blob.data() + blobHdr.const_data_section_offset + data->dataLocation().offset);
}
+}
- int numOutputs = 0;
- BlobSerializer outputInfoSerializer;
+void BackEnd::serializeConstShapes(const Model& model, const mv_blob_header& blobHdr, std::vector<char>& blob) {
for (const auto& data : model->datas()) {
- if (data->usage() != DataUsage::Output) {
- continue;
- }
+ const auto serializeToBlob = [&data, &blob, &blobHdr](const BlobSerializer& serializer, int offset) {
+ std::copy_n(serializer.data(), data->desc().numDims() * sizeof(uint32_t), blob.data() + blobHdr.const_data_section_offset + offset);
+ };
- IE_ASSERT(data->producerEdge() != nullptr);
- IE_ASSERT(data->parentDataEdge() == nullptr);
+ const auto dimsOrder = data->desc().dimsOrder();
+ const auto storedPerm = dimsOrder.toPermutation();
- IE_ASSERT(!data->attrs().has("ioIdx"));
- data->attrs().set("ioIdx", numOutputs);
+ const auto shapeLocation = data->shapeLocation();
- data->serializeIOInfo(outputInfoSerializer);
+ if (shapeLocation.dimsLocation == Location::Blob) {
+ BlobSerializer dimsSerializer;
+ const auto dims = data->desc().dims();
- ++numOutputs;
+ for (const auto& d : storedPerm) {
+ dimsSerializer.append(checked_cast<uint32_t>(dims[d]));
+ }
+ serializeToBlob(dimsSerializer, shapeLocation.dimsOffset);
+ }
+
+ if (shapeLocation.stridesLocation == Location::Blob) {
+ BlobSerializer stridesSerializer;
+ const auto strides = data->strides();
+
+ for (const auto& d : storedPerm) {
+ stridesSerializer.append(checked_cast<uint32_t>(strides[d]));
+ }
+ serializeToBlob(stridesSerializer, shapeLocation.stridesOffset);
+ }
}
+}
- //
- // Stages section
- //
+void BackEnd::serialize(
+ const Model& model,
+ std::vector<char>& blob,
+ std::pair<char*, size_t>& blobHeader,
+ int& numActiveStages) {
+ VPU_PROFILE(serialize);
+ const auto& env = CompileEnv::get();
+ BlobSerializer inputInfoSerializer;
+ BlobSerializer outputInfoSerializer;
BlobSerializer stagesSerializer;
+
+ const auto getExecStages = [&model]() {
+ StageVector execStages;
+ execStages.reserve(model->numStages());
+
+ for (const auto& stage : model->getStages()) {
+ if (stage->category() == StageCategory::Special) {
+ continue;
+ }
+
+ execStages.emplace_back(stage);
+ }
+
+ return execStages;
+ };
+
+ const auto getModelStagesStat = [&model]() {
+ ModelStagesStat modelStagesStat{false, false, false};
+
+ for (const auto& stage : model->getStages()) {
+ if (stage->category() == StageCategory::Special) {
+ continue;
+ }
+
+ if (stage->category() == StageCategory::HW) {
+ modelStagesStat.hasHwStage = true;
+ } else if (stage->category() == StageCategory::SHAVE) {
+ modelStagesStat.hasShaveStage = true;
+ } else if (stage->category() == StageCategory::DMA) {
+ modelStagesStat.hasDmaStage = true;
+ }
+ }
+
+ return modelStagesStat;
+ };
+
+ const auto createBlobHeader = [&env, &model, &inputInfoSerializer, &outputInfoSerializer, &stagesSerializer]
+ (int numInputs, int numOutputs, const StageVector& execStages, const ModelStagesStat& modelStagesStat) {
+ const auto batchSize = model->batchSize();
+ const auto usedMemory = model->attrs().get<UsedMemory>("usedMemory");
+
+ const auto hdrSize = alignVal<int>(sizeof(ElfN_Ehdr) + sizeof(mv_blob_header), 64);
+ const auto inputInfoSecSize = alignVal(inputInfoSerializer.size(), 64);
+ const auto outputInfoSecSize = alignVal(outputInfoSerializer.size(), 64);
+ const auto stagesSecSize = alignVal(stagesSerializer.size(), 64);
+ const auto constDataSecSize = alignVal(usedMemory.blob, 64);
+
+ mv_blob_header blobHdr = {};
+ blobHdr.magic_number = BLOB_MAGIC_NUMBER;
+ blobHdr.file_size = checked_cast<uint32_t>(hdrSize + inputInfoSecSize + outputInfoSecSize + stagesSecSize + constDataSecSize);
+ blobHdr.blob_ver_major = BLOB_VERSION_MAJOR;
+ blobHdr.blob_ver_minor = BLOB_VERSION_MINOR;
+ blobHdr.inputs_count = checked_cast<uint32_t>(numInputs);
+ blobHdr.outputs_count = checked_cast<uint32_t>(numOutputs);
+ blobHdr.stages_count = checked_cast<uint32_t>(execStages.size());
+ blobHdr.inputs_size = checked_cast<uint32_t>(usedMemory.input);
+ blobHdr.outputs_size = checked_cast<uint32_t>(usedMemory.output);
+ blobHdr.batch_size = checked_cast<uint32_t>(batchSize);
+ blobHdr.bss_mem_size = checked_cast<uint32_t>(usedMemory.BSS);
+ blobHdr.number_of_cmx_slices = checked_cast<uint32_t>(env.resources.numCMXSlices);
+ blobHdr.number_of_shaves = checked_cast<uint32_t>(env.resources.numSHAVEs);
+ blobHdr.has_hw_stage = checked_cast<uint32_t>(modelStagesStat.hasHwStage);
+ blobHdr.has_shave_stage = checked_cast<uint32_t>(modelStagesStat.hasShaveStage);
+ blobHdr.has_dma_stage = checked_cast<uint32_t>(modelStagesStat.hasDmaStage);
+ blobHdr.input_info_section_offset = checked_cast<uint32_t>(hdrSize);
+ blobHdr.output_info_section_offset = checked_cast<uint32_t>(blobHdr.input_info_section_offset + inputInfoSecSize);
+ blobHdr.stage_section_offset = checked_cast<uint32_t>(blobHdr.output_info_section_offset + outputInfoSecSize);
+ blobHdr.const_data_section_offset = checked_cast<uint32_t>(blobHdr.stage_section_offset + stagesSecSize);
+
+ return blobHdr;
+ };
+
+ const int numInputs = serializeIOInfoSection(model, DataUsage::Input, inputInfoSerializer);
+ const int numOutputs = serializeIOInfoSection(model, DataUsage::Output, outputInfoSerializer);
+
+ const auto& execStages = getExecStages();
+ numActiveStages = checked_cast<int>(execStages.size());
+
for (const auto& stage : execStages) {
stage->serialize(stagesSerializer);
}
- //
- // Elf header
- //
+ const auto modelStagesStat = getModelStagesStat();
- ElfN_Ehdr elfHdr = {};
- elfHdr.e_ident[0] = 0x7f;
- elfHdr.e_ident[1] = 'e';
- elfHdr.e_ident[2] = 'l';
- elfHdr.e_ident[3] = 'f';
- for (int i = 4; i < 16; i++) {
- elfHdr.e_ident[i] = 0;
- }
- elfHdr.e_type = 1;
- elfHdr.e_machine = 2;
- elfHdr.e_version = 2;
- elfHdr.e_entry = 0;
- elfHdr.e_phoff = 0;
- elfHdr.e_shoff = 0;
- elfHdr.e_ehsize = 8 * sizeof(elfHdr);
-
- //
- // Blob header
- //
-
- auto hdrSize = alignVal<int>(sizeof(ElfN_Ehdr) + sizeof(mv_blob_header), 64);
- auto inputInfoSecSize = alignVal(inputInfoSerializer.size(), 64);
- auto outputInfoSecSize = alignVal(outputInfoSerializer.size(), 64);
- auto stagesSecSize = alignVal(stagesSerializer.size(), 64);
- auto constDataSecSize = alignVal(usedMemory.blob, 64);
-
- mv_blob_header blobHdr = {};
- blobHdr.magic_number = BLOB_MAGIC_NUMBER;
- blobHdr.file_size = checked_cast<uint32_t>(hdrSize + inputInfoSecSize + outputInfoSecSize + stagesSecSize + constDataSecSize);
- blobHdr.blob_ver_major = BLOB_VERSION_MAJOR;
- blobHdr.blob_ver_minor = BLOB_VERSION_MINOR;
- blobHdr.inputs_count = checked_cast<uint32_t>(numInputs);
- blobHdr.outputs_count = checked_cast<uint32_t>(numOutputs);
- blobHdr.stages_count = checked_cast<uint32_t>(execStages.size());
- blobHdr.inputs_size = checked_cast<uint32_t>(usedMemory.input);
- blobHdr.outputs_size = checked_cast<uint32_t>(usedMemory.output);
- blobHdr.batch_size = checked_cast<uint32_t>(batchSize);
- blobHdr.bss_mem_size = checked_cast<uint32_t>(usedMemory.BSS);
- blobHdr.number_of_cmx_slices = checked_cast<uint32_t>(env.resources.numCMXSlices);
- blobHdr.number_of_shaves = checked_cast<uint32_t>(env.resources.numSHAVEs);
- blobHdr.has_hw_stage = checked_cast<uint32_t>(hasHwStage);
- blobHdr.has_shave_stage = checked_cast<uint32_t>(hasShaveStage);
- blobHdr.has_dma_stage = checked_cast<uint32_t>(hasDmaStage);
- blobHdr.input_info_section_offset = checked_cast<uint32_t>(hdrSize);
- blobHdr.output_info_section_offset = checked_cast<uint32_t>(blobHdr.input_info_section_offset + inputInfoSecSize);
- blobHdr.stage_section_offset = checked_cast<uint32_t>(blobHdr.output_info_section_offset + outputInfoSecSize);
- blobHdr.const_data_section_offset = checked_cast<uint32_t>(blobHdr.stage_section_offset + stagesSecSize);
-
- //
- // Generate fathom blob
- //
+ const auto elfHdr = createElfHeader();
+ const auto blobHdr = createBlobHeader(numInputs, numOutputs, execStages, modelStagesStat);
blob.clear();
blob.resize(blobHdr.file_size, 0);
std::copy_n(outputInfoSerializer.data(), outputInfoSerializer.size(), blob.data() + blobHdr.output_info_section_offset);
std::copy_n(stagesSerializer.data(), stagesSerializer.size(), blob.data() + blobHdr.stage_section_offset);
- for (const auto& data : model->datas()) {
- if (data->usage() != DataUsage::Const) {
- continue;
- }
-
- IE_ASSERT(data->producerEdge() == nullptr);
- IE_ASSERT(data->parentDataEdge() == nullptr);
- IE_ASSERT(data->numConsumers() != 0);
- IE_ASSERT(data->location() == DataLocation::Blob);
-
- auto content = data->content();
- IE_ASSERT(content != nullptr);
-
- std::copy_n(content->get<uint8_t>(), data->totalByteSize(), blob.data() + blobHdr.const_data_section_offset + data->memoryOffset());
- }
-
- //
- // Blob header spec begin containing elf header and blobHeader
- //
+ serializeConstData(model, blobHdr, blob);
+ serializeConstShapes(model, blobHdr, blob);
blobHeader.first = blob.data();
blobHeader.second = sizeof(ElfN_Ehdr) + sizeof(mv_blob_header);
// Truncate zeros
inputName = inputName.c_str();
- auto dataType = static_cast<DataType>(readFromBlob<uint32_t>(blob, inputInfoSecOffset));
+ auto dataType = readFromBlob<DataType>(blob, inputInfoSecOffset);
auto orderCode = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
auto numDims = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
auto perm = dimsOrder.toPermutation();
IE_ASSERT(perm.size() == numDims);
+ auto dimsLocation = readFromBlob<Location>(blob, inputInfoSecOffset);
+ VPU_THROW_UNLESS(dimsLocation == Location::Blob,
+ "BlobReader error while parsing {} input data: only Blob location for input shape is supported, but {} was given",
+ inputName, dimsLocation);
+ auto dimsOffset = _blobHeader.const_data_section_offset + readFromBlob<uint32_t>(blob, inputInfoSecOffset);
+
+ // Skip strides' location and offset
+ inputInfoSecOffset += 2 * sizeof(uint32_t);
+
DimValues vpuDims;
+
for (int i = 0; i < perm.size(); ++i) {
- vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, inputInfoSecOffset));
+ vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, dimsOffset));
}
- // Skip strides
- inputInfoSecOffset += perm.size() * sizeof(uint32_t);
-
ie::TensorDesc ieDesc = DataDesc(dataType, dimsOrder, vpuDims).toTensorDesc();
ie::Data inputData(inputName, ieDesc);
// Truncate zeros
outputName = outputName.c_str();
- auto dataType = static_cast<DataType>(readFromBlob<uint32_t>(blob, outputInfoSecOffset));
+ auto dataType = readFromBlob<DataType>(blob, outputInfoSecOffset);
auto orderCode = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
auto numDims = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
auto perm = dimsOrder.toPermutation();
IE_ASSERT(perm.size() == numDims);
+ auto dimsLocation = readFromBlob<Location>(blob, outputInfoSecOffset);
+ VPU_THROW_UNLESS(dimsLocation == Location::Blob,
+ "BlobReader error while parsing {} output data: only Blob location for output shape is supported, but {} was given",
+ outputName, dimsLocation);
+ auto dimsOffset = _blobHeader.const_data_section_offset + readFromBlob<uint32_t>(blob, outputInfoSecOffset);
+
+ // Skip strides' location and offset
+ outputInfoSecOffset += 2 * sizeof(uint32_t);
+
DimValues vpuDims;
+
for (int i = 0; i < perm.size(); ++i) {
- vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, outputInfoSecOffset));
+ vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, dimsOffset));
}
- // Skip strides
- outputInfoSecOffset += perm.size() * sizeof(uint32_t);
-
ie::TensorDesc ieDesc = DataDesc(dataType, dimsOrder, vpuDims).toTensorDesc();
ie::Data outputData(outputName, ieDesc);
#include <details/caseless.hpp>
#include <details/ie_cnn_network_iterator.hpp>
#include <cpp/ie_cnn_network.h>
-#include <cnn_network_ngraph_impl.hpp>
#include <graph_tools.hpp>
#include <ngraph/function.hpp>
auto checkForDeprecatedCnn = [&network, &env]() {
return !network.getFunction()
&& !env.config.forceDeprecatedCnnConversion
- && dynamic_cast<const ie::details::CNNNetworkNGraphImpl*>(&network);
+ && !dynamic_cast<const ie::details::CNNNetworkImpl*>(&network);
};
VPU_THROW_UNLESS(!checkForDeprecatedCnn(), "Unexpected CNNNetwork format: it was converted to deprecated format prior plugin's call");
#include "vpu/frontend/frontend.hpp"
#include "vpu/utils/profiling.hpp"
#include "vpu/compile_env.hpp"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
#include "net_pass.h"
{"OneHot", LAYER_PARSER(parseOneHot)},
{"ExperimentalDetectronPriorGridGenerator", LAYER_PARSER(parseExpPriorGridGenerator)},
{"ExperimentalDetectronGenerateProposalsSingleImage", LAYER_PARSER(parseExpGenerateProposals)},
+ {"ScatterUpdate", LAYER_PARSER(parseScatterUpdate)},
+ {"ExperimentalDetectronTopKROIs", LAYER_PARSER(parseExpTopKROIs)},
+ {"StaticShapeNonZero", LAYER_PARSER(parseNonZero)},
+ {"ROIAlign", LAYER_PARSER(parseROIAlign)},
}} {}
ModelPtr FrontEnd::buildInitialModel(ie::ICNNNetwork& network) {
#include <vpu/frontend/frontend.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
#include <memory>
#include <algorithm>
#include <set>
#include <map>
#include <string>
-#include <vpu/compile_env.hpp>
-#include <vpu/utils/ie_helpers.hpp>
-
namespace vpu {
void FrontEnd::parseInputAndOutputData(const Model& model) {
const auto vpuData = model->addConstData(
ieData->getName(),
descriptor,
- ieBlobContent(ieBlob));
+ ieBlobContent(ieBlob, descriptor.type()));
// User might ask to return the output from Const layer.
if (const auto vpuOutData = getVpuData(ieData)) {
#include <vpu/frontend/frontend.hpp>
-#include <vector>
-#include <memory>
-#include <string>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/model/data_contents/mean_contents.hpp>
#include <details/caseless.hpp>
#include <cpp/ie_cnn_network.h>
#include <precision_utils.h>
#include <ie_parallel.hpp>
-#include <vpu/middleend/sw/utility.hpp>
-#include <vpu/utils/ie_helpers.hpp>
-#include <vpu/compile_env.hpp>
+#include <vector>
+#include <memory>
+#include <string>
namespace vpu {
-namespace {
-
-class MeanImageContent final : public CalculatedDataContent {
-public:
- explicit MeanImageContent(const ie::PreProcessInfo& info) : _info(info) {
- }
-
-protected:
- size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
- size_t countElem = checked_cast<size_t>(desc().dim(Dim::W) * desc().dim(Dim::H) * desc().dim(Dim::C));
- if (desc().dimsOrder() == DimsOrder::NHWC || desc().dimsOrder() == DimsOrder::HWC) {
- countElem *= 2;
- }
-
- return countElem * sizeof(fp16_t);
- }
-
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
- VPU_PROFILE(MeanImageContent);
-
- const size_t numOfChannel = _info.getNumberOfChannels();
-
- const size_t imagePixels = checked_cast<size_t>(desc().dim(Dim::W) * desc().dim(Dim::H));
- const size_t countElem = checked_cast<size_t>(desc().dim(Dim::W) * desc().dim(Dim::H) * desc().dim(Dim::C));
-
- const auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- auto dstPtr2 = dstPtr;
- if (desc().dimsOrder() == DimsOrder::NHWC || desc().dimsOrder() == DimsOrder::HWC) {
- dstPtr2 += countElem;
- }
-
- ie::parallel_for(numOfChannel, [=](size_t i) {
- const auto meanDataBlob = _info[i]->meanData;
-
- ie::PrecisionUtils::f32tof16Arrays(
- dstPtr2 + i * imagePixels,
- meanDataBlob->buffer().as<const float*>(),
- imagePixels,
- -1.0f);
- });
-
- if (desc().dimsOrder() == DimsOrder::NHWC || desc().dimsOrder() == DimsOrder::HWC) {
- kchw_to_hwck(dstPtr2, dstPtr, desc());
- }
- }
-
-private:
- ie::PreProcessInfo _info;
-};
-
-class MeanValueContent final : public CalculatedDataContent {
-public:
- explicit MeanValueContent(const ie::PreProcessInfo& info) : _info(info) {
- }
-
-protected:
- size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
- return _info.getNumberOfChannels() * sizeof(fp16_t);
- }
-
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
- VPU_PROFILE(MeanValueContent);
-
- IE_ASSERT(checked_cast<size_t>(desc().totalDimSize()) == _info.getNumberOfChannels());
-
- const auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- ie::parallel_for(_info.getNumberOfChannels(), [dstPtr, this](size_t i) {
- dstPtr[i] = ie::PrecisionUtils::f32tof16(-_info[i]->meanValue);
- });
- }
-
-private:
- ie::PreProcessInfo _info;
-};
-
-} // namespace
-
void FrontEnd::addPreProcessStages(const Model& model) {
VPU_PROFILE(addPreProcessStages);
const auto meanImage = model->addConstData(
input->name() + "@mean-image",
input->desc(),
- std::make_shared<MeanImageContent>(preProcess));
+ std::make_shared<MeanImageContent>(preProcess, input->desc()));
const auto newInput = model->duplicateData(
input,
#include "graph_transformer.h"
#include "cnn_network_impl.hpp"
-#include "cnn_network_ngraph_impl.hpp"
namespace vpu {
env.log->trace("Remove const layers");
VPU_LOGGER_SECTION(env.log);
- ie::ICNNNetwork* cnnNetwork = &network;
- if (auto nGraphImpl = dynamic_cast<ie::details::CNNNetworkNGraphImpl*>(&network)) {
- // NGraph implementation cannot be casted to CNNNetworkImpl directly
- cnnNetwork = nGraphImpl->getCNNNetwork().get();
- }
-
- // valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
- if (auto cnnNetworkImpl = dynamic_cast<ie::details::CNNNetworkImpl*>(cnnNetwork)) {
- ie::ConstTransformer(cnnNetworkImpl).fullTrim();
- }
+ ie::ConstTransformer(&network).fullTrim();
}
} // namespace vpu
auto parent = edge->parent();
auto child = edge->child();
- auto memoryOffset = parent->memoryOffset();
+ auto memoryOffset = parent->dataLocation().offset;
if (edge->mode() == SharedDataMode::ROI) {
auto parentStrides = parent->strides();
IE_ASSERT(false) << "Unsupported enum value";
}
- child->setAllocationInfo(parent->location(), memoryOffset);
+ child->setDataAllocationInfo({parent->dataLocation().location, memoryOffset});
updateChildDataAllocation(child, offsetLimitation);
}
auto finalByteSize = data->totalByteSize() * _modelBatchSize;
- data->setIOInfo(DataLocation::Input, alignVal(_inputMemOffset, DATA_ALIGNMENT));
+ data->setIOInfo(Location::Input, alignVal(_inputMemOffset, DATA_ALIGNMENT));
_inputMemOffset = alignVal(_inputMemOffset, DATA_ALIGNMENT) + finalByteSize;
updateChildDataAllocation(data, DDR_MAX_SIZE);
finalByteSize = data->totalByteSize() * _modelBatchSize;
}
- data->setIOInfo(DataLocation::Output, alignVal(_outputMemOffset, DATA_ALIGNMENT));
+ data->setIOInfo(Location::Output, alignVal(_outputMemOffset, DATA_ALIGNMENT));
_outputMemOffset = alignVal(_outputMemOffset, DATA_ALIGNMENT) + finalByteSize;
updateChildDataAllocation(data, DDR_MAX_SIZE);
auto finalByteSize = calcAllocationSize(data);
- data->setAllocationInfo(DataLocation::Blob, _blobMemOffset);
+ data->setDataAllocationInfo({Location::Blob, _blobMemOffset});
_blobMemOffset += finalByteSize;
updateChildDataAllocation(data, DDR_MAX_SIZE);
// Update data allocation info
//
- data->setAllocationInfo(chunk->memType == MemoryType::CMX ? DataLocation::CMX : DataLocation::BSS, chunk->pointer);
+ data->setDataAllocationInfo({chunk->memType == MemoryType::CMX ? Location::CMX : Location::BSS, chunk->pointer});
- auto offsetLimitation = (data->location() == DataLocation::CMX) ? _maxCmxSize : DDR_MAX_SIZE;
+ auto offsetLimitation = (data->dataLocation().location == Location::CMX) ? _maxCmxSize : DDR_MAX_SIZE;
updateChildDataAllocation(data, offsetLimitation);
_memChunksPerData.emplace(data, chunk);
return chunk->memType == memoryType;
}
+// Reserve blob-section storage for a data object's static shape.
+// Both the dims array and the strides array are placed in the blob
+// (Location::Blob), laid out back to back at the current blob offset.
+// NOTE(review): the strides block is reserved with the same byte size as
+// the dims block (dimsByteSize) — i.e. one entry per dimension; confirm
+// that dimsByteSize() is the intended size for strides as well.
+ShapeLocation Allocator::allocateConstShape(Data& data) {
+ ShapeLocation shapeLocation;
+
+ // Shape is constant, so it is serialized into the blob itself.
+ shapeLocation.dimsLocation = Location::Blob;
+ shapeLocation.stridesLocation = Location::Blob;
+
+ const auto dimsByteSize = data->desc().dimsByteSize();
+
+ // Dims first, at the current end of the blob memory region...
+ shapeLocation.dimsOffset = _blobMemOffset;
+ _blobMemOffset += dimsByteSize;
+
+ // ...then strides, immediately after the dims.
+ shapeLocation.stridesOffset = _blobMemOffset;
+ _blobMemOffset += dimsByteSize;
+
+ return shapeLocation;
+}
+
void Allocator::freeData(const Data& data, DeallocationMode mode) {
//
// Release the chunk
_memChunksPerData[data] = ddrChunk;
- data->setAllocationInfo(DataLocation::BSS, ddrChunk->pointer);
+ data->setDataAllocationInfo({Location::BSS, ddrChunk->pointer});
updateChildDataAllocation(data, DDR_MAX_SIZE);
break;
#include <vpu/middleend/hw/conv_tiling/hw_stage_tiler.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/stages/mx_stage.hpp>
+#include <vpu/middleend/hw/tiling.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/utils/attributes_map.hpp>
+#include <vpu/model/data_contents/hw_weights_content.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+#include <vpu/model/data_contents/scaled_content.hpp>
+
#include <precision_utils.h>
#include <memory>
#include <list>
#include <unordered_map>
#include <set>
-#include <vpu/stages/stub_stage.hpp>
-#include <vpu/stages/mx_stage.hpp>
-#include <vpu/middleend/hw/tiling.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-#include <vpu/utils/attributes_map.hpp>
-
namespace vpu {
namespace {
hwScales = _model->addConstData(
_original->name() + "@scales",
DataDesc({maxExtendedOutputDimC}),
- replicateContent(stageOptions.reluScale, maxExtendedOutputDimC));
+ replicateContent(stageOptions.reluScale, maxExtendedOutputDimC, DataDesc{maxExtendedOutputDimC}));
} else {
hwScales = _model->addFakeData();
}
const auto content = std::make_shared<HwWeightsContent>(
io.origWeights->content(),
io.origWeights->desc(),
+ descriptor,
channelTile->numInputChannels,
channelTile->channelStartIndex);
}
}
-//
-// HwWeightsContent
-//
-
-HwWeightsContent::HwWeightsContent(const DataContent::Ptr& origContent,
- const DataDesc& origWeightsDesc,
- int numInputChannels,
- int channelStartIndex) :
- CalculatedDataContent({origContent}),
- _origWeightsDesc(origWeightsDesc),
- _numInputChannels(numInputChannels),
- _channelStartIndex(channelStartIndex) {
-}
-
-void HwWeightsContent::fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
- VPU_PROFILE(HwWeightsContent);
-
- IE_ASSERT(desc().type() == DataType::FP16);
- IE_ASSERT(baseContents.size() == 1);
-
- auto KX = _origWeightsDesc.dim(Dim::W);
- auto KY = _origWeightsDesc.dim(Dim::H);
- auto IC = _origWeightsDesc.dim(Dim::C);
- auto OC = _origWeightsDesc.dim(Dim::N);
- auto origTotalSize = _origWeightsDesc.totalDimSize();
-
- auto HW_OC_inner = desc().dim(Dim::W);
- auto HW_OC_outer = desc().dim(Dim::N);
- IE_ASSERT(HW_OC_outer * HW_OC_inner >= OC);
-
- auto HW_K = desc().dim(Dim::H);
- IE_ASSERT(HW_K == KX * KY);
-
- IE_ASSERT(_channelStartIndex < IC);
- auto HW_IC = desc().dim(Dim::C);
- auto HW_IC_real = std::min(_numInputChannels, IC - _channelStartIndex);
-
- auto srcData = baseContents[0]->get<fp16_t>();
- IE_ASSERT(srcData != nullptr);
-
- auto dstData = static_cast<fp16_t*>(tempBuf);
-
- IE_ASSERT((_channelStartIndex + HW_IC_real) * HW_K + (OC - 1) * HW_K * IC - 1 < origTotalSize);
- IE_ASSERT((OC - 1) % HW_OC_inner +
- (HW_K - 1) * HW_OC_inner +
- (HW_IC_real - 1) * HW_OC_inner * HW_K +
- ((OC - 1) / 8) * HW_OC_inner * HW_K * HW_IC < desc().totalDimSize());
-
- if (KX == 1 && KY == 1) {
- ie::parallel_for(OC, [=](int oc) {
- auto oc_inner = oc % HW_OC_inner;
- auto oc_outer = oc / HW_OC_inner;
- for (int ic = 0; ic < HW_IC_real; ++ic) {
- auto srcInd =
- (_channelStartIndex + ic) +
- oc * IC;
- auto dstInd =
- oc_inner +
- ic * HW_OC_inner * HW_K +
- oc_outer * HW_OC_inner * HW_K * HW_IC;
-
- dstData[dstInd] = srcData[srcInd];
- }
- });
- } else {
- ie::parallel_for(OC, [=](int oc) {
- auto oc_inner = oc % HW_OC_inner;
- auto oc_outer = oc / HW_OC_inner;
- for (int ic = 0; ic < HW_IC_real; ++ic) {
- for (int ky = 0; ky < KY; ++ky) {
- for (int kx = 0; kx < KX; ++kx) {
- auto srcInd =
- (kx + ky * KX) +
- (_channelStartIndex + ic) * HW_K +
- oc * HW_K * IC;
- auto dstInd =
- oc_inner +
- (ky * KX + kx) * HW_OC_inner +
- ic * HW_OC_inner * HW_K +
- oc_outer * HW_OC_inner * HW_K * HW_IC;
-
- dstData[dstInd] = srcData[srcInd];
- }
- }
- }
- });
- }
-}
-
int calculateHwBufferSize(const DimValues& dims, const DimsOrder& order) {
const auto desc = DataDesc{DataType::FP16, order.empty() ? DimsOrder::fromNumDims(dims.size()) : order, dims};
IE_ASSERT(desc.numDims() > 2 || desc.dimsOrder() == DimsOrder::NC);
#include "vpu/stages/iteration_rule.hpp"
#include "vpu/middleend/pass_manager.hpp"
+#include "vpu/model/data_contents/replicated_data_content.hpp"
#include <utility>
#include <string>
auto inputEdge = stage->inputEdge(0);
auto input = inputEdge->input();
- IE_ASSERT(input->location() != DataLocation::None);
+ IE_ASSERT(input->dataLocation().location != Location::None);
- if (input->memoryOffset() % 16 != 0) {
+ if (input->dataLocation().offset % 16 != 0) {
env.log->trace("HW Stage [%s] input [%s]", stage->name(), input->name());
auto newInput = model->duplicateData(
}
}
+ //
+ // Allocate shape for all datas
+ //
+
+ for (auto data : model->datas()) {
+ const auto shapeLocation = allocator.allocateConstShape(data);
+ data->setShapeAllocationInfo(shapeLocation);
+ }
+
return AllocationResult();
}
#include <vpu/middleend/pass_manager.hpp>
-#include <vector>
-#include <memory>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
#include <blob_factory.hpp>
-#include <vpu/middleend/sw/utility.hpp>
+#include <vector>
+#include <memory>
namespace vpu {
});
if (memoryType == MemoryType::CMX) {
- IE_ASSERT(topParent->location() == DataLocation::CMX);
+ IE_ASSERT(topParent->dataLocation().location == Location::CMX);
}
//
#include <vpu/middleend/pass_manager.hpp>
-#include <precision_utils.h>
-#include <ie_parallel.hpp>
-
#include <vpu/compile_env.hpp>
#include <vpu/stages/stub_stage.hpp>
#include <vpu/stages/mx_stage.hpp>
#include <vpu/middleend/hw/utility.hpp>
#include <vpu/middleend/hw/conv_tiling/hw_convolution_tiler.hpp>
#include <vpu/middleend/hw/conv_tiling/hw_stage_tiler.hpp>
+#include <vpu/model/data_contents/hw_const_data_content.hpp>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
#include <utility>
#include <memory>
namespace {
-struct Slice {
- int start;
- size_t size;
-
- Slice(int start, size_t size) :
- start(start),
- size(size) {}
-};
-
-struct DataSlice {
- Data data;
- Slice slice;
-
- DataSlice(Data data, Slice slice) :
- data(std::move(data)),
- slice(slice) {}
-};
-
-using DataSlices = std::vector<DataSlice>;
-
-struct ConvTileSlice {
- HwConvTileInfo tile;
- Slice slice;
-
- ConvTileSlice(HwConvTileInfo tile, Slice slice) :
- tile(tile),
- slice(slice) {}
-};
-
class PassImpl final : public Pass {
public:
explicit PassImpl(StageBuilder::Ptr stageBuilder) : _stageBuilder(std::move(stageBuilder)) {}
if (infoData1 != infoData2)
return infoData1 < infoData2;
- const auto size = data1->content()->desc().totalDimSize();
+ const auto size = data1->content()->byteSize() / sizeof(fp16_t);
const auto content1 = data1->content()->get<fp16_t>();
const auto content2 = data2->content()->get<fp16_t>();
std::map<Data, DataSlices, LexicographicalCompareByData> _splitConstData;
};
-class HwConstData final : public CalculatedDataContent {
-public:
- HwConstData(
- const DataContent::Ptr& origContent,
- const DataDesc& origDesc,
- const std::map<Dim, Slice> dimSlices) :
- CalculatedDataContent({origContent}),
- _origDesc(origDesc),
- _dimSlices(dimSlices) {}
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* outBuf) const override {
- VPU_PROFILE(HwConstData);
-
- VPU_THROW_UNLESS(
- desc().type() == DataType::FP16,
- "Constant data has %v data type while only %v is supported",
- desc().type(), DataType::FP16);
-
- VPU_THROW_UNLESS(baseContents.size() == 1,
- "Missing source buffer for constant data");
-
- const auto srcData = baseContents[0]->get<fp16_t>();
- auto dstData = static_cast<fp16_t*>(outBuf);
-
- VPU_THROW_UNLESS(srcData != nullptr,
- "Source buffer for constant data has null address");
-
- auto getDimSlice = [this](const Dim dim) {
- auto it = _dimSlices.find(dim);
- if (it != _dimSlices.end()) {
- return it->second;
- }
-
- const int startInd = 0;
- const size_t size = _origDesc.dim(dim);
-
- return Slice(startInd, size);
- };
-
- if (_origDesc.numDims() == 4) {
- Slice slice = getDimSlice(Dim::N);
-
- int startOC = slice.start;
- size_t numOC = slice.size;
-
- const auto IC = _origDesc.dim(Dim::C);
- const auto K = _origDesc.dim(Dim::H);
- const auto V = _origDesc.dim(Dim::W);
-
- const auto kernelStride = V;
- const auto inChannelStride = K * kernelStride;
- const auto outerStride = IC * inChannelStride;
-
- ie::parallel_for(numOC, [=](int oc) {
- const auto ocSlice = oc;
- oc += startOC;
-
- const auto ocInner = oc % V;
- const auto ocOuter = oc / V;
- const auto ocSliceInner = ocSlice % V;
- const auto ocSliceOuter = ocSlice / V;
-
- const auto ocSrc = ocInner + ocOuter * outerStride;
- const auto ocDst = ocSliceInner + ocSliceOuter * outerStride;
-
- for (int ic = 0; ic < IC; ++ic)
- for (int k = 0; k < K; ++k) {
- const auto srcInd = ocSrc +
- k * kernelStride +
- ic * inChannelStride;
- const auto dstInd = ocDst +
- k * kernelStride +
- ic * inChannelStride;
-
- dstData[dstInd] = srcData[srcInd];
- }
- });
- } else if (_origDesc.numDims() == 1) {
- Slice slice = getDimSlice(Dim::C);
-
- std::copy(srcData + slice.start, srcData + slice.start + slice.size, dstData);
- } else {
- THROW_IE_EXCEPTION << "Invalid number of dimensions " << _origDesc.numDims();
- }
- }
-
-private:
- DataDesc _origDesc;
- std::map<Dim, Slice> _dimSlices;
-};
-
void PassImpl::run(const Model& model) {
VPU_PROFILE(hwExtraSplit);
const auto content = std::make_shared<HwConstData>(
weights->content(),
weights->desc(),
+ weightsDesc,
dimSlices);
weightsDesc.setDim(Dim::N, alignVal(numChannels, 8) / vectorSize);
const auto biasesContent = std::make_shared<HwConstData>(
biases->content(),
biases->desc(),
+ newBiasesDesc,
dimSlices);
const auto newBiases = model->duplicateData(biases, postfix, newBiasesDesc, biasesContent);
const auto scalesContent = std::make_shared<HwConstData>(
scales->content(),
scales->desc(),
+ newScalesDesc,
dimSlices);
const auto newScales = model->duplicateData(scales, postfix, newScalesDesc, scalesContent);
#include <vpu/middleend/pass_manager.hpp>
-#include <cmath>
+#include <vpu/compile_env.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/stages/mx_stage.hpp>
+#include <vpu/middleend/hw/tiling.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/model/data_contents/hw_weights_content.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <precision_utils.h>
+#include <cmath>
#include <tuple>
#include <vector>
#include <limits>
#include <set>
#include <array>
-#include <precision_utils.h>
-
-#include <vpu/compile_env.hpp>
-#include <vpu/stages/stub_stage.hpp>
-#include <vpu/stages/mx_stage.hpp>
-#include <vpu/middleend/hw/tiling.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-
namespace vpu {
namespace {
const auto& content = std::make_shared<HwWeightsContent>(
origWeights->content(),
+ dataDescriptor,
contentDescriptor,
extendedHWInputDimC);
// SPDX-License-Identifier: Apache-2.0
//
-#include <memory>
-#include <utility>
-#include <vector>
+#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/model/data_contents/merge_fc_content.hpp>
#include <ie_parallel.hpp>
-#include <vpu/middleend/pass_manager.hpp>
-#include <vpu/stages/stub_stage.hpp>
+#include <memory>
+#include <utility>
+#include <vector>
namespace vpu {
namespace {
-class MergeFullyConnectedContentsByChannels final : public CalculatedDataContent {
-public:
- explicit MergeFullyConnectedContentsByChannels(const SmallVector<DataContent::Ptr, 2>& contents) :
- CalculatedDataContent(contents) {}
-
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& contents, void* temp) const override {
- IE_ASSERT(!contents.empty());
- // vpu::DataNode has content and vpu::DataDesc with dimensions' vector
- // content has dimensions's vector as well
- // they can be different so we extract channels number from contents
- const auto dstC = std::accumulate(contents.begin(), contents.end(), 0, [](int reduction, const DataContent::Ptr& content) {
- return reduction + content->desc().dims()[Dim::C];});
-
- for (std::size_t i = 0, dstChannelsOffset = 0; i < contents.size(); ++i) {
- const auto& content = contents[i];
- const auto& srcDesc = content->desc();
-
- const auto& srcDims = srcDesc.dims();
- const auto& elemSize = srcDesc.elemSize();
-
- const auto N = srcDims.get(Dim::N, 1);
- const auto H = srcDims.get(Dim::H, 1);
- const auto W = srcDims.get(Dim::W, 1) * elemSize;
-
- const auto& srcC = srcDims[Dim::C];
-
- const auto src = content->get<uint8_t>();
- auto dst = static_cast<uint8_t*>(temp);
-
- InferenceEngine::parallel_for4d(N, srcC, H, W, [dstChannelsOffset, N, H, W, src, dst, srcC, dstC](int n, int c, int h, int w) {
- const auto& srcc = c;
- const auto& dstc = dstChannelsOffset + c;
-
- const auto& srcOffset = n * H * W * srcC + srcc * H * W + h * W + w;
- const auto& dstOffset = n * H * W * dstC + dstc * H * W + h * W + w;
- dst[dstOffset] = src[srcOffset];
- });
-
- dstChannelsOffset += srcC;
- }
- }
-};
-
DataDesc mergeDescriptors(const DataVector& dataObjects) {
const auto& targetDim = Dim::C;
auto mergedDescriptor = dataObjects.front()->desc();
return model->addFakeData();
}
- std::vector<DataContent::Ptr> contents;
+ std::vector<DataContent::CPtr> contents;
+ std::vector<DataDesc> descs;
for (const auto& data : dataObjects) {
contents.push_back(data->content());
+ descs.push_back(data->desc());
}
- auto content = std::make_shared<MergeFullyConnectedContentsByChannels>(contents);
- return model->duplicateData(dataObjects.front(), "@merge-parallel-fc", mergeDescriptors(dataObjects), content);
+ auto mergedDesc = mergeDescriptors(dataObjects);
+
+ auto content = std::make_shared<MergeFullyConnectedContentsByChannels>(contents, descs, mergedDesc);
+ return model->duplicateData(dataObjects.front(), "@merge-parallel-fc", mergedDesc, content);
}
Data mergeOutputs(const Model& model, const DataVector& dataObjects) {
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/model/data_contents/deconvolution_contents.hpp>
+
#include <tuple>
#include <vector>
#include <algorithm>
#include <unordered_map>
#include <memory>
-#include <vpu/stages/stub_stage.hpp>
-#include <vpu/middleend/sw/utility.hpp>
-#include <vpu/compile_env.hpp>
-
namespace vpu {
namespace {
}
};
-
-class DeconvolutionToConvolutionContent final : public CalculatedDataContent {
-public:
- DeconvolutionToConvolutionContent(
- const DataContent::Ptr& origContent) :
- CalculatedDataContent({origContent}) {
- }
-
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
- VPU_PROFILE(DeconvolutionToConvolutionContent);
-
- IE_ASSERT(baseContents.size() == 1);
- IE_ASSERT(desc().type() == DataType::FP16);
-
- deconv_to_conv(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
- }
-};
-
-
class PassImpl final : public Pass {
public:
explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
auto newOutput = model->duplicateData(output, "@upsampleData", newDesc);
auto newWeights = model->duplicateData(weights, "@upsampleData", weights->desc(),
- std::make_shared<DeconvolutionToConvolutionContent>(weights->content()));
+ std::make_shared<DeconvolutionToConvolutionContent>(weights->content(), weights->desc()));
auto upsampleStage = model->addNewStage<UpsamplingStage>(
stage->origLayerName() + "@Upsample",
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/model/data_contents/priorbox_contents.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
#include <cmath>
#include <algorithm>
#include <vector>
#include <queue>
-#include <ie_parallel.hpp>
-#include <precision_utils.h>
-
-#include <vpu/stages/stub_stage.hpp>
-
namespace vpu {
namespace {
-class PriorBoxContent final : public CalculatedDataContent {
-public:
- PriorBoxContent(
- const DataDesc& inDesc0,
- const DataDesc& inDesc1,
- const DataDesc& outDesc,
- const ie::CNNLayerPtr &layer) :
- _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
- _layer(layer) {
- IE_ASSERT(layer != nullptr);
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2> &, void *tempBuf) const override {
- VPU_PROFILE(PriorBoxContent);
-
- auto tempPtr = static_cast<fp16_t*>(tempBuf);
-
- auto _min_sizes = _layer->GetParamAsFloats("min_size", {});
- auto _max_sizes = _layer->GetParamAsFloats("max_size", {});
- auto aspect_ratios = _layer->GetParamAsFloats("aspect_ratio");
- auto _flip = static_cast<bool>(_layer->GetParamAsInt("flip"));
- auto _clip = static_cast<bool>(_layer->GetParamAsInt("clip"));
- auto _variance = _layer->GetParamAsFloats("variance");
- auto _img_h = _layer->GetParamAsInt("img_h", 0);
- auto _img_w = _layer->GetParamAsInt("img_w", 0);
- auto _step = _layer->GetParamAsFloat("step", 0);
- auto _offset = _layer->GetParamAsFloat("offset", 0);
- auto _scale_all_sizes = static_cast<bool>(_layer->GetParamAsInt("scale_all_sizes", 1));
-
- auto _fixed_sizes = _layer->GetParamAsFloats("fixed_size", {});
- auto _fixed_ratios = _layer->GetParamAsFloats("fixed_ratio", {});
- auto _densitys = _layer->GetParamAsFloats("density", {});
-
- SmallVector<float> _aspect_ratios;
- _aspect_ratios.reserve(aspect_ratios.size() + 1);
-
- _aspect_ratios.push_back(1.0f);
- for (const auto& aspect_ratio : aspect_ratios) {
- bool exist = false;
-
- for (const auto& _aspect_ratio : _aspect_ratios) {
- if (fabsf(aspect_ratio - _aspect_ratio) < 1e-6) {
- exist = true;
- break;
- }
- }
- if (!exist) {
- _aspect_ratios.push_back(aspect_ratio);
- if (_flip) {
- if (isFloatEqual(aspect_ratio, 0.f)) {
- THROW_IE_EXCEPTION << "[VPU] PriorBox has 0.0 aspect ratio param in flip mode, "
- << " possible division by zero";
- }
- _aspect_ratios.push_back(1.0f / aspect_ratio);
- }
- }
- }
-
- int _num_priors;
- if (_scale_all_sizes) {
- _num_priors = static_cast<int>(_aspect_ratios.size() * _min_sizes.size());
- } else {
- _num_priors = static_cast<int>(_aspect_ratios.size() + _min_sizes.size() - 1);
- }
-
- if (!_fixed_sizes.empty()) {
- _num_priors = static_cast<int>(_aspect_ratios.size() * _fixed_sizes.size());
- }
-
- if (!_densitys.empty()) {
- for (const auto& _density : _densitys) {
- if (!_fixed_ratios.empty()) {
- _num_priors += _fixed_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
- } else {
- _num_priors += _aspect_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
- }
- }
- }
-
- _num_priors += _max_sizes.size();
-
- auto W = _inDesc0.dim(Dim::W);
- auto H = _inDesc0.dim(Dim::H);
- auto IW = _img_w == 0 ? _inDesc1.dim(Dim::W) : _img_w;
- auto IH = _img_h == 0 ? _inDesc1.dim(Dim::H) : _img_h;
- auto IWI = 1.0f / static_cast<float>(IW);
- auto IHI = 1.0f / static_cast<float>(IH);
-
- auto OW = (_outDesc.numDims() >= 4) ? _outDesc.dim(Dim::N) : 1;
- auto OH = _outDesc.dim(Dim::W);
-
- float step_x = 0.0f;
- float step_y = 0.0f;
-
- if (_step == 0) {
- step_x = static_cast<float>(IW) / W;
- step_y = static_cast<float>(IH) / H;
- } else {
- step_x = _step;
- step_y = _step;
- }
-
- auto dst_data = tempPtr;
-
- int dim = H * W * _num_priors * 4;
- float center_x = 0.0f;
- float center_y = 0.0f;
-
- float box_width = 0.0f;
- float box_height = 0.0f;
-
- if (_outDesc.dim(Dim::W) != dim || _outDesc.dim(Dim::H) != 2) {
- THROW_IE_EXCEPTION << "[VPU] PriorBox output have invalid dimension, exptected " << dim << "x2"
- << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H)
- << ", layer name is: " << _layer->name;
- }
-
- auto max_fp16 = [](const float value, const float min) {
- return ie::PrecisionUtils::f32tof16(value > min ? value : min);
- };
-
- auto min_fp16 = [](const float value, const float max) {
- return ie::PrecisionUtils::f32tof16(value < max ? value : max);
- };
-
- size_t idx = 0;
- for (int h = 0; h < H; ++h) {
- for (int w = 0; w < W; ++w) {
- if (_step == 0) {
- center_x = (static_cast<float>(w) + 0.5f) * step_x;
- center_y = (static_cast<float>(h) + 0.5f) * step_y;
- } else {
- center_x = (_offset + static_cast<float>(w)) * _step;
- center_y = (_offset + static_cast<float>(h)) * _step;
- }
-
- for (size_t s = 0; s < _fixed_sizes.size(); ++s) {
- auto fixed_size_ = static_cast<size_t>(_fixed_sizes[s]);
- box_width = box_height = fixed_size_ * 0.5f;
-
- int density_ = 0;
- int shift = 0;
- if (s < _densitys.size()) {
- density_ = static_cast<size_t>(_densitys[s]);
- shift = static_cast<int>(_fixed_sizes[s] / density_);
- }
-
- if (!_fixed_ratios.empty()) {
- for (const auto& fr : _fixed_ratios) {
- const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(fr);
- const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(fr);
-
- for (size_t r = 0; r < density_; ++r) {
- for (size_t c = 0; c < density_; ++c) {
- const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
- const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
-
- dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
- dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
- dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
- dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
- }
- }
- }
- } else {
- if (!_densitys.empty()) {
- for (int r = 0; r < density_; ++r) {
- for (int c = 0; c < density_; ++c) {
- const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
- const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
-
- dst_data[idx++] = max_fp16((center_x_temp - box_width) * IWI, 0.f);
- dst_data[idx++] = max_fp16((center_y_temp - box_height) * IHI, 0.f);
- dst_data[idx++] = min_fp16((center_x_temp + box_width) * IWI, 1.f);
- dst_data[idx++] = min_fp16((center_y_temp + box_height) * IHI, 1.f);
- }
- }
- }
- // Rest of priors
- for (const auto& ar : _aspect_ratios) {
- if (fabs(ar - 1.) < 1e-6) {
- continue;
- }
-
- const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(ar);
- const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(ar);
- for (int r = 0; r < density_; ++r) {
- for (int c = 0; c < density_; ++c) {
- const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
- const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
-
- dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
- dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
- dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
- dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
- }
- }
- }
- }
- }
-
- for (size_t msIdx = 0; msIdx < _min_sizes.size(); msIdx++) {
- box_width = _min_sizes[msIdx];
- box_height = _min_sizes[msIdx];
-
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
-
- if (_max_sizes.size() > msIdx) {
- box_width = box_height = std::sqrt(_min_sizes[msIdx] * _max_sizes[msIdx]);
-
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
- }
-
- if (_scale_all_sizes || (!_scale_all_sizes && (msIdx == _min_sizes.size() - 1))) {
- size_t sIdx = _scale_all_sizes ? msIdx : 0;
- for (const auto& ar : _aspect_ratios) {
- if (std::fabs(ar - 1.0f) < 1e-6) {
- continue;
- }
-
- box_width = _min_sizes[sIdx] * std::sqrt(ar);
- box_height = _min_sizes[sIdx] / std::sqrt(ar);
-
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
- dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
- }
- }
- }
- }
- }
-
- if (_clip) {
- for (int d = 0; d < dim; ++d) {
- dst_data[d] = (std::min)((std::max)(dst_data[d], ie::PrecisionUtils::f32tof16(0.0f)), ie::PrecisionUtils::f32tof16(1.0f));
- }
- }
-
- int channel_size = OH * OW;
-
- dst_data += channel_size;
-
- if (_variance.size() == 1) {
- ie::parallel_for(channel_size, [&](int i) {
- dst_data[i] = ie::PrecisionUtils::f32tof16(_variance[0]);
- });
- } else {
- ie::parallel_for4d(H, W, _num_priors, 4, [&](int h, int w, int i, int j) {
- dst_data[j + 4 * (i + _num_priors * (w + W * h))] = ie::PrecisionUtils::f32tof16(_variance[j]);
- });
- }
- }
-
-private:
- DataDesc _inDesc0;
- DataDesc _inDesc1;
- DataDesc _outDesc;
- ie::CNNLayerPtr _layer;
-};
-
-class PriorBoxClusteredContent final : public CalculatedDataContent {
-public:
- PriorBoxClusteredContent(
- const DataDesc& inDesc0,
- const DataDesc& inDesc1,
- const DataDesc& outDesc,
- const ie::CNNLayerPtr& layer) :
- _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
- _layer(layer) {
- IE_ASSERT(layer != nullptr);
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
- VPU_PROFILE(PriorBoxClusteredContent);
-
- auto tempPtr = static_cast<fp16_t*>(tempBuf);
-
- auto widths_ = _layer->GetParamAsFloats("width");
- auto heights_ = _layer->GetParamAsFloats("height");
- auto clip_ = _layer->GetParamAsInt("clip");
- auto variance_ = _layer->GetParamAsFloats("variance");
- auto img_h_ = _layer->GetParamAsInt("img_h", 0);
- auto img_w_ = _layer->GetParamAsInt("img_w", 0);
- auto step_ = _layer->GetParamAsFloat("step", 0);
- auto step_h_ = _layer->GetParamAsFloat("step_h", 0);
- auto step_w_ = _layer->GetParamAsFloat("step_w", 0);
- auto offset_ = _layer->GetParamAsFloat("offset", 0);
-
- auto num_priors_ = widths_.size();
-
- if (variance_.empty()) {
- variance_.push_back(0.1);
- }
-
- auto layer_width = _inDesc0.dim(Dim::W);
- auto layer_height = _inDesc0.dim(Dim::H);
-
- auto img_width = img_w_ == 0 ? _inDesc1.dim(Dim::W) : img_w_;
- auto img_height = img_h_ == 0 ? _inDesc1.dim(Dim::H) : img_h_;
-
- auto step_w = step_w_ == 0 ? step_ : step_w_;
- auto step_h = step_h_ == 0 ? step_ : step_h_;
- if (step_w == 0 || step_h == 0) {
- step_w = static_cast<float>(img_width) / layer_width;
- step_h = static_cast<float>(img_height) / layer_height;
- }
-
- auto expetected_output_dimx = layer_height * layer_width * num_priors_ * 4;
- if (_outDesc.dim(Dim::W) != expetected_output_dimx || _outDesc.dim(Dim::H) != 2) {
- THROW_IE_EXCEPTION << "PriorBoxClustered output has invalid dimension, exptected " << expetected_output_dimx << "x2"
- << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H) << ", layer name is: " << _layer->name;
- }
-
- auto offset = _outDesc.dim(Dim::W);
- auto var_size = variance_.size();
-
- auto top_data_0 = tempPtr;
- auto top_data_1 = top_data_0 + offset;
-
- ie::parallel_for2d(layer_height, layer_width, [=](int h, int w) {
- auto center_x = (w + offset_) * step_w;
- auto center_y = (h + offset_) * step_h;
-
- for (int s = 0; s < num_priors_; ++s) {
- auto box_width = widths_[s];
- auto box_height = heights_[s];
-
- auto xmin = (center_x - box_width / 2.0f) / img_width;
- auto ymin = (center_y - box_height / 2.0f) / img_height;
- auto xmax = (center_x + box_width / 2.0f) / img_width;
- auto ymax = (center_y + box_height / 2.0f) / img_height;
-
- if (clip_) {
- xmin = std::min(std::max(xmin, 0.0f), 1.0f);
- ymin = std::min(std::max(ymin, 0.0f), 1.0f);
- xmax = std::min(std::max(xmax, 0.0f), 1.0f);
- ymax = std::min(std::max(ymax, 0.0f), 1.0f);
- }
-
- top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 0] = ie::PrecisionUtils::f32tof16(xmin);
- top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 1] = ie::PrecisionUtils::f32tof16(ymin);
- top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 2] = ie::PrecisionUtils::f32tof16(xmax);
- top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 3] = ie::PrecisionUtils::f32tof16(ymax);
-
- for (int j = 0; j < var_size; j++) {
- auto index = h * layer_width * num_priors_ * var_size + w * num_priors_ * var_size + s * var_size + j;
- top_data_1[index] = ie::PrecisionUtils::f32tof16(variance_[j]);
- }
- }
- });
- }
-
-private:
- DataDesc _inDesc0;
- DataDesc _inDesc1;
- DataDesc _outDesc;
- ie::CNNLayerPtr _layer;
-};
-
//
// UnusedDataRemover class deletes data that has no consumers,
// and also recursively deletes all its unused predecessors, including
#include <vpu/middleend/pass_manager.hpp>
#include <vpu/middleend/sw/utility.hpp>
#include <vpu/model/data.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
#include <precision_utils.h>
#include "vpu/middleend/pass_manager.hpp"
#include "vpu/utils/numeric.hpp"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
+
#include "precision_utils.h"
#include "ie_memcpy.h"
#include <vpu/middleend/pass_manager.hpp>
-#include <vector>
-#include <set>
-#include <memory>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
#include <precision_utils.h>
-#include <vpu/utils/numeric.hpp>
+#include <vector>
+#include <set>
+#include <memory>
namespace vpu {
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
#include <vector>
#include <set>
#include <memory>
#include <array>
-#include <vpu/compile_env.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-
namespace vpu {
namespace {
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/middleend/hw/tiling.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <precision_utils.h>
+
#include <memory>
#include <array>
#include <string>
#include <tuple>
#include <limits>
-#include <precision_utils.h>
-
-#include <vpu/middleend/hw/tiling.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-
namespace vpu {
namespace {
#include "vpu/stage_builder.hpp"
#include "vpu/utils/numeric.hpp"
#include "precision_utils.h"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
#include <memory>
#include <set>
//
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/model/data_contents/conv_weights_contents.hpp>
+#include <vpu/model/data_contents/default_sw_weights_content.hpp>
+
#include <limits>
#include <vector>
#include <unordered_set>
#include <set>
-#include <vpu/middleend/sw/utility.hpp>
-
#define REFERENCE_CONVOLUTION 0
namespace vpu {
namespace {
-class ConvIm2ColWeightsContent final : public CalculatedDataContent {
-public:
- explicit ConvIm2ColWeightsContent(const DataContent::Ptr& origContent) :
- CalculatedDataContent({origContent}) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(ConvIm2ColWeightsContent);
- kchw_to_khwc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
- }
-};
-
-class Conv3x3WeightsContent final : public CalculatedDataContent {
-public:
- explicit Conv3x3WeightsContent(const DataContent::Ptr& origContent) :
- CalculatedDataContent({origContent}) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(Conv3x3WeightsContent);
- kchw_to_hwkc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
- }
-};
-
-class ConvCHWWeightsContent final : public CalculatedDataContent {
-public:
- explicit ConvCHWWeightsContent(const DataContent::Ptr& origContent) :
- CalculatedDataContent({origContent}) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(ConvCHWWeightsContent);
- kchw_to_hwkc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
- }
-};
-
class ConvStage final : public StageNode {
public:
using StageNode::StageNode;
weights,
"@SW",
newWeightsDesc,
- std::make_shared<DefaultSwWeightsContent>(weights->content()));
+ std::make_shared<DefaultSwWeightsContent>(weights->content(), newWeightsDesc));
weights->attrs().set<Data>("swWeights", swWeights);
}
weights,
"@SW",
newWeightsDesc,
- std::make_shared<DefaultSwWeightsContent>(weights->content()));
+ std::make_shared<DefaultSwWeightsContent>(weights->content(), newWeightsDesc));
} else if (isConv1x1) {
swWeights = model()->duplicateData(
weights,
weights,
"@SW",
newWeightsDesc,
- std::make_shared<Conv3x3WeightsContent>(weights->content()));
+ std::make_shared<Conv3x3WeightsContent>(weights->content(), newWeightsDesc));
} else {
swWeights = model()->duplicateData(
weights,
"@SW",
newWeightsDesc,
- std::make_shared<ConvIm2ColWeightsContent>(weights->content()));
+ std::make_shared<ConvIm2ColWeightsContent>(weights->content(), newWeightsDesc));
double im2ColBufSizeF = static_cast<double>(kernelSizeX) * kernelSizeY *
output->desc().dim(Dim::W) * output->desc().dim(Dim::H) * input->desc().dim(Dim::C)
weights,
"@SW",
newWeightsDesc,
- std::make_shared<ConvCHWWeightsContent>(weights->content()));
+ std::make_shared<ConvCHWWeightsContent>(weights->content(), newWeightsDesc));
}
weights->attrs().set<Data>("swWeights", swWeights);
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/model/data_contents/deconvolution_contents.hpp>
+
+#include <ie_parallel.hpp>
+
#include <vector>
#include <string>
#include <memory>
#include <unordered_set>
#include <set>
-#include <ie_parallel.hpp>
-
-#include <vpu/middleend/sw/utility.hpp>
-#include <vpu/utils/numeric.hpp>
-
namespace vpu {
namespace {
-void depthDeconvolutionRelayoutCHW(
- const fp16_t* src, int src_size,
- fp16_t* dst, int dst_size,
- int KX, int KY,
- int channels) {
- ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
- int iidx = c * KX * KY + ky * KX + kx;
- IE_ASSERT(iidx >= 0 && iidx < src_size);
-
- int inv_kx = KX - kx - 1;
- int inv_ky = KY - ky - 1;
- int oidx = c * KX * KY + inv_ky * KX + inv_kx;
- IE_ASSERT(oidx >= 0 && oidx < dst_size);
-
- dst[oidx] = src[iidx];
- });
-}
-
-class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
-public:
- DepthDeconvolutionCHWWeightsContent(
- const DataContent::Ptr& origContent,
- int KX, int KY, int channels) :
- CalculatedDataContent({origContent}),
- _KX(KX), _KY(KY), _channels(channels) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
- depthDeconvolutionRelayoutCHW(
- baseContents[0]->get<fp16_t>(), desc().totalDimSize(),
- static_cast<fp16_t*>(tempBuf), desc().totalDimSize(),
- _KX, _KY, _channels);
- }
-
-private:
- int _KX;
- int _KY;
- int _channels;
-};
-
-void depthDeconvolutionRelayoutHWC(
- const fp16_t* src, int src_size,
- fp16_t* dst, int dst_size,
- int KX, int KY,
- int channels) {
- ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
- int iidx = c * KX * KY + ky * KX + kx;
- IE_ASSERT(iidx < src_size);
-
- int inv_kx = KX - kx - 1;
- int inv_ky = KY - ky - 1;
- int oidx = inv_ky * KX * channels + inv_kx * channels + c;
- IE_ASSERT(oidx < dst_size);
-
- dst[oidx] = src[iidx];
- });
-}
-
-class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
-public:
- DepthDeconvolutionHWCWeightsContent(
- const DataContent::Ptr& origContent,
- int KX, int KY, int channels) :
- CalculatedDataContent({origContent}),
- _KX(KX), _KY(KY), _channels(channels) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
- depthDeconvolutionRelayoutHWC(
- baseContents[0]->get<fp16_t>(), desc().totalDimSize(),
- static_cast<fp16_t*>(tempBuf), desc().totalDimSize(),
- _KX, _KY, _channels);
- }
-
-private:
- int _KX;
- int _KY;
- int _channels;
-};
-
-void deconvolutionRelayout(
- const fp16_t* src, int src_size,
- fp16_t* dst, int dst_size,
- int KX, int KY,
- int IC, int OC) {
- ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
- int iidx = ic * OC * KY * KX
- + oc * KY * KX
- + ky * KX
- + kx;
- IE_ASSERT(iidx >= 0 && iidx < src_size);
-
- int inv_kx = KX - kx - 1;
- int inv_ky = KY - ky - 1;
- int oidx = oc * IC * KY * KX
- + ic * KY * KX
- + inv_ky * KX
- + inv_kx;
- IE_ASSERT(oidx >= 0 && oidx < dst_size);
-
- dst[oidx] = src[iidx];
- });
-}
-
-class DeconvolutionWeightsContent final : public CalculatedDataContent {
-public:
- DeconvolutionWeightsContent(
- const DataContent::Ptr& origContent,
- int KX, int KY,
- int IC, int OC) :
- CalculatedDataContent({origContent}),
- _KX(KX), _KY(KY),
- _IC(IC), _OC(OC) {
- }
-
-protected:
- size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
- return 2 * desc().totalDimSize() * sizeof(fp16_t);
- }
-
-
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(DeconvolutionWeightsContent);
-
- auto dstPtr = static_cast<fp16_t*>(tempBuf);
- auto dstPtr2 = dstPtr + desc().totalDimSize();
-
- deconvolutionRelayout(
- baseContents[0]->get<fp16_t>(), desc().totalDimSize(),
- dstPtr2, desc().totalDimSize(),
- _KX, _KY,
- _IC, _OC);
-
- kchw_to_hwkc(dstPtr2, dstPtr, desc());
- }
-
-private:
- int _KX;
- int _KY;
- int _IC;
- int _OC;
-};
-
class DeconvStage final : public StageNode {
public:
using StageNode::StageNode;
newWeightsDesc,
std::make_shared<DeconvolutionWeightsContent>(
weights->content(),
+ newWeightsDesc,
kernelSizeX, kernelSizeY,
input->desc().dim(Dim::C),
output->desc().dim(Dim::C)));
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/model/data_contents/default_sw_weights_content.hpp>
+
#include <vector>
#include <memory>
#include <string>
#include <set>
-#include <vpu/middleend/sw/utility.hpp>
-
namespace vpu {
namespace {
weights,
"@SW",
weights->desc(),
- std::make_shared<DefaultSwWeightsContent>(weights->content()));
+ std::make_shared<DefaultSwWeightsContent>(weights->content(), weights->desc()));
weights->attrs().set<Data>("swWeights", swWeights);
}
#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+#include <vpu/model/data_contents/scaled_content.hpp>
+
+#include <details/caseless.hpp>
+#include <precision_utils.h>
+
#include <cmath>
#include <sstream>
#include <list>
#include <set>
-#include <precision_utils.h>
-
-#include <vpu/utils/numeric.hpp>
-#include <vpu/compile_env.hpp>
-
-#include <details/caseless.hpp>
-
namespace vpu {
namespace {
IE_ASSERT(stage->output(0)->desc().dims().has(Dim::C));
const auto outputChannels = stage->output(0)->desc().dims()[Dim::C];
- auto scaleInput = model->addConstData(stage->name() + "@scales", DataDesc{{outputChannels}}, replicateContent(1.0f / scale, outputChannels));
+ auto scaleInput = model->addConstData(stage->name() + "@scales",
+ DataDesc{{outputChannels}},
+ replicateContent(1.0f / scale, outputChannels, DataDesc{outputChannels}));
model->replaceStageInput(stage->inputEdge(SCALES_IDX), scaleInput);
}
namespace vpu {
//
-// DefaultSwWeightsContent
-//
-
-DefaultSwWeightsContent::DefaultSwWeightsContent(const DataContent::Ptr& origContent) :
- CalculatedDataContent({origContent}) {
-}
-
-void DefaultSwWeightsContent::fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
- VPU_PROFILE(DefaultSwWeightsContent);
-
- IE_ASSERT(desc().type() == DataType::FP16);
- IE_ASSERT(baseContents.size() == 1);
-
- kchw_to_hwck(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
-}
-
-//
// getOneOfSingleNextStage
//
#include <vpu/model/data.hpp>
+#include <vpu/model/edges.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/backend/backend.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/compile_env.hpp>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
#include <array>
#include <algorithm>
#include <queue>
#include <set>
#include <utility>
-#include <precision_utils.h>
-#include <ie_parallel.hpp>
-
-#include <vpu/model/edges.hpp>
-#include <vpu/model/stage.hpp>
-#include <vpu/backend/backend.hpp>
-#include <vpu/utils/ie_helpers.hpp>
-#include <vpu/utils/numeric.hpp>
-#include <vpu/compile_env.hpp>
-
namespace vpu {
//
-// DataContent
-//
-
-DataContent::~DataContent() = default;
-
-const void* CalculatedDataContent::getRaw() const {
- if (_temp.empty()) {
- _temp.resize(getTempBufSize(_baseContents));
- fillTempBuf(_baseContents, _temp.data());
- _baseContents.clear();
- }
- return _temp.data();
-}
-
-size_t CalculatedDataContent::getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const {
- return checked_cast<size_t>(desc().totalDimSize()) *
- checked_cast<size_t>(desc().elemSize());
-}
-
-namespace {
-
-class IeBlobContent final : public DataContent {
-public:
- IeBlobContent(const ie::Blob::Ptr& blob, int repeat) : _blob(blob), _repeat(repeat) {}
-
-protected:
- const void* getRaw() const override {
- if (desc().type() == DataType::FP16) {
- if (_blobFp16 == nullptr) {
- _blobFp16 = getBlobFP16(_blob);
- _blob.reset();
- }
-
- if (_repeat == 1) {
- return _blobFp16->cbuffer();
- } else {
- if (_tempFp16.empty()) {
- VPU_PROFILE(IeBlobContent);
-
- IE_ASSERT(desc().totalDimSize() % _repeat == 0);
-
- auto origNumElems = desc().totalDimSize() / _repeat;
- IE_ASSERT(checked_cast<size_t>(origNumElems) <= _blobFp16->size());
-
- auto origPtr = _blobFp16->cbuffer().as<const fp16_t*>();
- IE_ASSERT(origPtr != nullptr);
-
- _tempFp16.resize(checked_cast<size_t>(desc().totalDimSize()));
-
- ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
- std::copy_n(origPtr, origNumElems, _tempFp16.data() + i * origNumElems);
- });
- }
-
- return _tempFp16.data();
- }
- } else if (desc().type() == DataType::S32) {
- if (_repeat == 1) {
- return _blob->cbuffer();
- } else {
- if (_tempS32.empty()) {
- VPU_PROFILE(IeBlobContent);
-
- IE_ASSERT(desc().totalDimSize() % _repeat == 0);
-
- auto origNumElems = desc().totalDimSize() / _repeat;
- IE_ASSERT(checked_cast<size_t>(origNumElems) <= _blob->size());
-
- auto origPtr = _blob->cbuffer().as<const int32_t*>();
- IE_ASSERT(origPtr != nullptr);
-
- _tempS32.resize(checked_cast<size_t>(desc().totalDimSize()));
-
- ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
- std::copy_n(origPtr, origNumElems, _tempS32.data() + i * origNumElems);
- });
- }
-
- return _tempS32.data();
- }
- } else {
- VPU_THROW_EXCEPTION << "Unsupported data type " << desc().type();
- }
- }
-
-private:
- mutable ie::Blob::Ptr _blob;
- int _repeat = 0;
-
- mutable ie::Blob::Ptr _blobFp16;
- mutable std::vector<fp16_t> _tempFp16;
- mutable std::vector<int32_t> _tempS32;
-};
-
-} // namespace
-
-DataContent::Ptr ieBlobContent(const ie::Blob::Ptr& blob, int repeat) {
- return std::make_shared<IeBlobContent>(blob, repeat);
-}
-
-namespace {
-
-class ReplicatedContent final : public CalculatedDataContent {
-public:
- ReplicatedContent(float val, int count) : _factor{val}, _count(count) {}
-
- ReplicatedContent(DataContent::Ptr origContent, int count) :
- CalculatedDataContent({std::move(origContent)}), _count(count) {
- }
-
-protected:
- size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>& baseContents) const override {
- if (baseContents.empty()) {
- return checked_cast<size_t>(_count) * sizeof(fp16_t);
- } else {
- IE_ASSERT(baseContents.size() == 1);
- IE_ASSERT(desc().totalDimSize() % _count == 0);
-
- return checked_cast<size_t>(desc().totalDimSize()) * sizeof(fp16_t);
- }
- }
-
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(ReplicatedContent);
-
- auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- if (baseContents.empty()) {
- std::fill_n(dstPtr, _count, ie::PrecisionUtils::f32tof16(_factor));
- } else {
- IE_ASSERT(baseContents.size() == 1);
- IE_ASSERT(desc().totalDimSize() % _count == 0);
-
- auto origCount = desc().totalDimSize() / _count;
- auto origPtr = baseContents[0]->get<fp16_t>();
- IE_ASSERT(origPtr != nullptr);
-
- ie::parallel_for(_count, [origPtr, origCount, dstPtr](int i) {
- std::copy_n(origPtr, origCount, dstPtr + i * origCount);
- });
- }
- }
-
-private:
- float _factor = 1.0f;
- int _count = 0;
-};
-
-} // namespace
-
-DataContent::Ptr replicateContent(float val, int count) {
- return std::make_shared<ReplicatedContent>(val, count);
-}
-
-DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count) {
- return std::make_shared<ReplicatedContent>(origContent, count);
-}
-
-namespace {
-
-class ScaledContent final : public CalculatedDataContent {
-public:
- ScaledContent(const DataContent::Ptr& origContent, float scale) :
- CalculatedDataContent({origContent}), _factor(scale) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(ScaledContent);
-
- IE_ASSERT(baseContents.size() == 1);
-
- auto totalSize = desc().totalDimSize();
-
- auto origDesc = baseContents[0]->desc();
- IE_ASSERT(origDesc.type() == DataType::FP16);
- IE_ASSERT(origDesc.totalDimSize() == totalSize);
-
- auto srcPtr = baseContents[0]->get<fp16_t>();
- IE_ASSERT(srcPtr != nullptr);
-
- auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- ie::parallel_for(totalSize, [this, srcPtr, dstPtr](int i) {
- dstPtr[i] = ie::PrecisionUtils::f32tof16(ie::PrecisionUtils::f16tof32(srcPtr[i]) * _factor);
- });
- }
-
-private:
- float _factor = 1.0f;
-};
-
-} // namespace
-
-DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale) {
- return std::make_shared<ScaledContent>(origContent, scale);
-}
-
-namespace {
-
-class ScaledChannelContent final : public CalculatedDataContent {
-public:
- ScaledChannelContent(
- const DataContent::Ptr& origContent,
- const DataContent::Ptr& scaleContent) :
- CalculatedDataContent({origContent, scaleContent}) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(ScaledChannelContent);
-
- IE_ASSERT(baseContents.size() == 2);
-
- auto totalSize = desc().totalDimSize();
-
- IE_ASSERT(desc().numDims() == 4 && desc().dimsOrder() == DimsOrder::NCHW);
- auto numN = desc().dim(Dim::N);
- auto numC = desc().dim(Dim::C);
- auto numH = desc().dim(Dim::H);
- auto numW = desc().dim(Dim::W);
-
- auto origDesc = baseContents[0]->desc();
- IE_ASSERT(origDesc.type() == DataType::FP16);
- IE_ASSERT(origDesc.totalDimSize() == totalSize);
- IE_ASSERT(baseContents[1]->desc().totalDimSize() == numN);
-
- auto srcPtr = baseContents[0]->get<fp16_t>();
- IE_ASSERT(srcPtr != nullptr);
-
- auto scale = baseContents[1]->get<fp16_t>();
- IE_ASSERT(scale != nullptr);
-
- auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- for (int n = 0; n < numN; n++) {
- for (int c = 0; c < numC; c++) {
- for (int h = 0; h < numH; h++) {
- for (int w = 0; w < numW; w++) {
- dstPtr[n * numC * numH * numW + c * numH * numW + h * numW + w] =
- srcPtr[n * numC * numH * numW + c * numH * numW + h * numW + w] * scale[n];
- }
- }
- }
- }
- }
-};
-
-} // namespace
-
-DataContent::Ptr scaledChannelContent(
- const DataContent::Ptr& origContent,
- const DataContent::Ptr& scaleContent) {
- return std::make_shared<ScaledChannelContent>(origContent, scaleContent);
-}
-
-//
// DataNode
//
}
void DataNode::clearAllocation() {
- _location = DataLocation::None;
- _memoryOffset = 0;
+ _dataLocation = defaultDataLocation;
attrs().erase("ioBufferOffset");
}
_memReqs = mem;
}
-void DataNode::setIOInfo(DataLocation location, int ioBufferOffset) {
- IE_ASSERT(_usage == DataUsage::Input || _usage == DataUsage::Output);
+void DataNode::setIOInfo(Location location, int ioBufferOffset) {
+ VPU_INTERNAL_CHECK(_usage == DataUsage::Input || _usage == DataUsage::Output,
+ "Data {} failed: setIOInfo called for non IO data, actual usage is {}",
+ name(), usage());
if (_usage == DataUsage::Input) {
- IE_ASSERT(location == DataLocation::Input);
+ VPU_INTERNAL_CHECK(location == Location::Input,
+ "Input data {} failed: setIOInfo called with non input location, actual location is {}",
+ name(), location);
} else if (_usage == DataUsage::Output) {
- IE_ASSERT(location == DataLocation::Output);
+ VPU_INTERNAL_CHECK(location == Location::Output,
+ "Output data {} failed: setIOInfo called with non output location, actual location is {}",
+ name(), location);
}
- _location = location;
- _memoryOffset = 0;
+ _dataLocation = {location, 0};
attrs().set<int>("ioBufferOffset", ioBufferOffset);
}
-void DataNode::setAllocationInfo(DataLocation location, int memoryOffset) {
- IE_ASSERT(_usage == DataUsage::Const || _usage == DataUsage::Intermediate || _usage == DataUsage::Temp);
+void DataNode::setDataAllocationInfo(const DataLocation& dataLocation) {
+ VPU_INTERNAL_CHECK(_usage == DataUsage::Const || _usage == DataUsage::Intermediate || _usage == DataUsage::Temp,
+ "Data {} failed: setDataAllocationInfo called for data with incorrect usage, actual usage: {} "
+ "valid usages: {}, {}, {}", name(), usage(), DataUsage::Const, DataUsage::Intermediate, DataUsage::Temp);
if (_usage == DataUsage::Const) {
- IE_ASSERT(location == DataLocation::Blob);
+ VPU_INTERNAL_CHECK(dataLocation.location == Location::Blob,
+ "Const data {} failed: setDataAllocationInfo called with non blob location, actual location is {}",
+ name(), dataLocation.location);
} else if (_usage == DataUsage::Temp) {
- IE_ASSERT(location == DataLocation::BSS);
+ VPU_INTERNAL_CHECK(dataLocation.location == Location::BSS,
+ "Temp data {} failed: setDataAllocationInfo called with non bss location, actual location is {}",
+ name(), dataLocation.location);
}
- _location = location;
- _memoryOffset = memoryOffset;
+ _dataLocation = dataLocation;
}
-void DataNode::serializeBuffer(
- BlobSerializer& serializer,
- DimsOrder newOrder) {
- if (newOrder.numDims() == 0) {
- serializeBufferImpl(serializer, _desc, this->strides());
- } else {
- IE_ASSERT(newOrder.numDims() >= _desc.dimsOrder().numDims());
-
- auto newDims = _desc.dims();
- auto newStrides = this->strides();
- auto newPerm = newOrder.toPermutation();
+void DataNode::setShapeAllocationInfo(const ShapeLocation& shapeLocation) {
+ _shapeLocation = shapeLocation;
+}
- auto origOrder = _desc.dimsOrder();
- auto origPerm = origOrder.toPermutation();
+void DataNode::serializeBuffer(
+ BlobSerializer& serializer) {
+ serializeDescImpl(serializer, _desc, this->strides());
- size_t origPermInd = 0;
- for (size_t i = 0; i < newPerm.size(); i++) {
- auto d = newPerm[i];
+ serializer.append(checked_cast<uint32_t>(_dataLocation.location));
- if (origPermInd < origPerm.size() && origPerm[origPermInd] == d) {
- ++origPermInd;
- continue;
- }
+ if (_dataLocation.location == Location::Input || _dataLocation.location == Location::Output) {
+ auto topParent = getTopParentData();
- newDims.set(d, 1);
- if (i == 0) {
- newStrides.set(d, _desc.elemSize());
- } else {
- newStrides.set(d, newStrides[newPerm[i - 1]] * newDims[newPerm[i - 1]]);
- }
- }
- IE_ASSERT(origPermInd == origPerm.size());
+ auto ioIdx = topParent->attrs().get<int>("ioIdx");
+ serializer.append(checked_cast<uint32_t>(ioIdx));
- DataDesc newDesc(_desc.type(), newOrder, newDims);
- serializeBufferImpl(serializer, newDesc, newStrides);
+ auto parentByteSize = topParent->totalByteSize();
+ serializer.append(checked_cast<uint32_t>(parentByteSize));
}
+
+ serializer.append(checked_cast<uint32_t>(_dataLocation.offset));
}
void DataNode::serializeIOInfo(BlobSerializer& serializer) const {
const DimValues& storedStrides) const {
IE_ASSERT(storedDesc.numDims() <= MAX_DIMS_32);
- const auto& storedDims = storedDesc.dims();
-
auto storedDimsOrder = storedDesc.dimsOrder();
auto storedPerm = storedDimsOrder.toPermutation();
serializer.append(checked_cast<uint32_t>(storedDimsOrder.code()));
serializer.append(checked_cast<uint32_t>(storedPerm.size()));
- for (auto d : storedPerm) {
- serializer.append(checked_cast<uint32_t>(storedDims[d]));
- }
- for (auto d : storedPerm) {
- serializer.append(checked_cast<uint32_t>(storedStrides[d]));
- }
-}
-
-void DataNode::serializeBufferImpl(
- BlobSerializer& serializer,
- const DataDesc& storedDesc,
- const DimValues& storedStrides) const {
- serializeDescImpl(serializer, storedDesc, storedStrides);
- serializer.append(checked_cast<uint32_t>(_location));
-
- if (_location == DataLocation::Input || _location == DataLocation::Output) {
- auto topParent = getTopParentData();
-
- auto ioIdx = topParent->attrs().get<int>("ioIdx");
- serializer.append(checked_cast<uint32_t>(ioIdx));
-
- auto parentByteSize = topParent->totalByteSize();
- serializer.append(checked_cast<uint32_t>(parentByteSize));
- }
+ const auto& shape = shapeLocation();
- serializer.append(checked_cast<uint32_t>(_memoryOffset));
+ serializer.append(checked_cast<uint32_t>(shape.dimsLocation));
+ serializer.append(checked_cast<uint32_t>(shape.dimsOffset));
+ serializer.append(checked_cast<uint32_t>(shape.stridesLocation));
+ serializer.append(checked_cast<uint32_t>(shape.stridesOffset));
}
void printTo(std::ostream& os, const Data& data) {
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/batch_norm_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// BatchNormalizationWeightsContent
+//
+
+BatchNormalizationWeightsContent::BatchNormalizationWeightsContent(const DataContent::Ptr& origContent,
+ float epsilon) :
+ _origContent(origContent), _epsilon(epsilon) {}
+
+size_t BatchNormalizationWeightsContent::byteSize() const {
+ return _origContent->byteSize();
+}
+
+void BatchNormalizationWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(BatchNormalizationWeightsContent);
+
+ auto srcPtr = _origContent->get<fp16_t>();
+ auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ ie::parallel_for(_origContent->byteSize() / sizeof(fp16_t), [this, srcPtr, dstPtr](int i) {
+ float val = ie::PrecisionUtils::f16tof32(srcPtr[i]) + _epsilon;
+ val = 1.0f / std::sqrt(val);
+ dstPtr[i] = ie::PrecisionUtils::f32tof16(val);
+ });
+}
+
+//
+// BatchNormalizationBiasesContent
+//
+
+BatchNormalizationBiasesContent::BatchNormalizationBiasesContent(const DataContent::Ptr& origContent,
+ const DataContent::Ptr& weightsContent) :
+ _origContent(origContent), _weightsContent(weightsContent) {}
+
+size_t BatchNormalizationBiasesContent::byteSize() const {
+ return _origContent->byteSize();
+}
+
+void BatchNormalizationBiasesContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(BatchNormalizationBiasesContent);
+
+ auto origPtr = _origContent->get<fp16_t>();
+ auto weightsPtr = _weightsContent->get<fp16_t>();
+
+ auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ ie::parallel_for(_origContent->byteSize() / sizeof(fp16_t), [origPtr, weightsPtr, dstPtr](int i) {
+ // TODO : need to be extracted from IE layer.
+ float beta = 0.0f;
+
+ auto wVal = ie::PrecisionUtils::f16tof32(weightsPtr[i]);
+ dstPtr[i] = ie::PrecisionUtils::f32tof16(beta - wVal * ie::PrecisionUtils::f16tof32(origPtr[i]));
+ });
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+const void* CalculatedDataContent::getRaw() const {
+ if (_temp.empty()) {
+ _temp.resize(byteSize());
+ fillTempBuf(_temp.data());
+ }
+ return _temp.data();
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/conv_weights_contents.hpp>
+
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/utils/profiling.hpp>
+
+namespace vpu {
+
+//
+// ConvIm2ColWeightsContent
+//
+
+ConvIm2ColWeightsContent::ConvIm2ColWeightsContent(const DataContent::Ptr& origContent, DataDesc desc) :
+ _origContent(origContent), _desc(desc) {}
+
+size_t ConvIm2ColWeightsContent::byteSize() const {
+ return checked_cast<size_t>(_desc.totalDimSize()) *
+ checked_cast<size_t>(_desc.elemSize());
+}
+
+void ConvIm2ColWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(ConvIm2ColWeightsContent);
+ kchw_to_khwc(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// Conv3x3WeightsContent
+//
+
+Conv3x3WeightsContent::Conv3x3WeightsContent(const DataContent::Ptr& origContent, DataDesc desc) :
+ _origContent(origContent), _desc(desc) {
+}
+
+size_t Conv3x3WeightsContent::byteSize() const {
+ return checked_cast<size_t>(_desc.totalDimSize()) *
+ checked_cast<size_t>(_desc.elemSize());
+}
+
+void Conv3x3WeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(Conv3x3WeightsContent);
+ kchw_to_hwkc(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// ConvCHWWeightsContent
+//
+
+ConvCHWWeightsContent::ConvCHWWeightsContent(const DataContent::Ptr& origContent, DataDesc desc) :
+ _origContent(origContent), _desc(desc) {
+}
+
+size_t ConvCHWWeightsContent::byteSize() const {
+ return checked_cast<size_t>(_desc.totalDimSize()) *
+ checked_cast<size_t>(_desc.elemSize());
+}
+
+void ConvCHWWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(ConvCHWWeightsContent);
+ kchw_to_hwkc(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+namespace vpu {
+
+DataContent::~DataContent() = default;
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/deconvolution_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+//
+// DeconvolutionToConvolutionContent
+//
+
+DeconvolutionToConvolutionContent::DeconvolutionToConvolutionContent(
+ const DataContent::Ptr& origContent, const DataDesc& desc) :
+ _origContent(origContent), _desc(desc) {
+}
+
+size_t DeconvolutionToConvolutionContent::byteSize() const {
+ return checked_cast<size_t>(_desc.totalDimSize()) *
+ checked_cast<size_t>(_desc.elemSize());
+}
+
+void DeconvolutionToConvolutionContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(DeconvolutionToConvolutionContent);
+
+ IE_ASSERT(_desc.type() == DataType::FP16);
+
+ deconv_to_conv(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// DepthDeconvolutionCHWWeightsContent
+//
+
+void depthDeconvolutionRelayoutCHW(
+ const fp16_t* src, int src_size,
+ fp16_t* dst, int dst_size,
+ int KX, int KY,
+ int channels) {
+ ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
+ int iidx = c * KX * KY + ky * KX + kx;
+ IE_ASSERT(iidx >= 0 && iidx < src_size);
+
+ int inv_kx = KX - kx - 1;
+ int inv_ky = KY - ky - 1;
+ int oidx = c * KX * KY + inv_ky * KX + inv_kx;
+ IE_ASSERT(oidx >= 0 && oidx < dst_size);
+
+ dst[oidx] = src[iidx];
+ });
+}
+
+DepthDeconvolutionCHWWeightsContent::DepthDeconvolutionCHWWeightsContent(
+ const DataContent::Ptr& origContent,
+ int KX, int KY, int channels) :
+ _origContent(origContent),
+ _KX(KX), _KY(KY), _channels(channels) {}
+
+void DepthDeconvolutionCHWWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
+ depthDeconvolutionRelayoutCHW(
+ _origContent->get<fp16_t>(), _origContent->byteSize() / sizeof(fp16_t),
+ static_cast<fp16_t*>(tempBuf), _origContent->byteSize() / sizeof(fp16_t),
+ _KX, _KY, _channels);
+}
+
+size_t DepthDeconvolutionCHWWeightsContent::byteSize() const {
+ return _origContent->byteSize();
+}
+
+//
+// DepthDeconvolutionHWCWeightsContent
+//
+
+void depthDeconvolutionRelayoutHWC(
+ const fp16_t* src, int src_size,
+ fp16_t* dst, int dst_size,
+ int KX, int KY,
+ int channels) {
+ ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
+ int iidx = c * KX * KY + ky * KX + kx;
+ IE_ASSERT(iidx < src_size);
+
+ int inv_kx = KX - kx - 1;
+ int inv_ky = KY - ky - 1;
+ int oidx = inv_ky * KX * channels + inv_kx * channels + c;
+ IE_ASSERT(oidx < dst_size);
+
+ dst[oidx] = src[iidx];
+ });
+}
+
+DepthDeconvolutionHWCWeightsContent::DepthDeconvolutionHWCWeightsContent(
+ const DataContent::Ptr& origContent,
+ int KX, int KY, int channels) :
+ _origContent(origContent),
+ _KX(KX), _KY(KY), _channels(channels) {
+}
+
+void DepthDeconvolutionHWCWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
+ depthDeconvolutionRelayoutHWC(
+ _origContent->get<fp16_t>(), _origContent->byteSize() / sizeof(fp16_t),
+ static_cast<fp16_t*>(tempBuf), _origContent->byteSize() / sizeof(fp16_t),
+ _KX, _KY, _channels);
+}
+
+size_t DepthDeconvolutionHWCWeightsContent::byteSize() const {
+ return _origContent->byteSize();
+}
+
+//
+// DeconvolutionWeightsContent
+//
+
+void deconvolutionRelayout(
+ const fp16_t* src, int src_size,
+ fp16_t* dst, int dst_size,
+ int KX, int KY,
+ int IC, int OC) {
+ ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
+ int iidx = ic * OC * KY * KX
+ + oc * KY * KX
+ + ky * KX
+ + kx;
+ IE_ASSERT(iidx >= 0 && iidx < src_size);
+
+ int inv_kx = KX - kx - 1;
+ int inv_ky = KY - ky - 1;
+ int oidx = oc * IC * KY * KX
+ + ic * KY * KX
+ + inv_ky * KX
+ + inv_kx;
+ IE_ASSERT(oidx >= 0 && oidx < dst_size);
+
+ dst[oidx] = src[iidx];
+ });
+}
+
+DeconvolutionWeightsContent::DeconvolutionWeightsContent(
+ const DataContent::Ptr& origContent,
+ DataDesc desc,
+ int KX, int KY,
+ int IC, int OC) :
+ _origContent(origContent), _desc(desc),
+ _intermBuf(_desc.totalDimSize()),
+ _KX(KX), _KY(KY),
+ _IC(IC), _OC(OC) {
+}
+
+size_t DeconvolutionWeightsContent::byteSize() const {
+ return _desc.totalDimSize() * sizeof(fp16_t);
+}
+
+void DeconvolutionWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(DeconvolutionWeightsContent);
+
+ auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ deconvolutionRelayout(
+ _origContent->get<fp16_t>(), _desc.totalDimSize(),
+ _intermBuf.data(), _desc.totalDimSize(),
+ _KX, _KY,
+ _IC, _OC);
+
+ kchw_to_hwkc(_intermBuf.data(), dstPtr, _desc);
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/default_sw_weights_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+
+namespace vpu {
+
+DefaultSwWeightsContent::DefaultSwWeightsContent(const DataContent::Ptr& origContent, const DataDesc& desc) :
+ _origContent(origContent), _desc(desc) {
+}
+
+size_t DefaultSwWeightsContent::byteSize() const {
+ return checked_cast<size_t>(_desc.totalDimSize()) *
+ checked_cast<size_t>(_desc.elemSize());
+}
+
+void DefaultSwWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(DefaultSwWeightsContent);
+
+ IE_ASSERT(_desc.type() == DataType::FP16);
+
+ kchw_to_hwck(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/hw_const_data_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+HwConstData::HwConstData(
+ const DataContent::Ptr& origContent,
+ const DataDesc& origDesc,
+ const DataDesc& resDesc,
+ const std::map<Dim, Slice> dimSlices) :
+ _origContent(origContent),
+ _origDesc(origDesc),
+ _resDesc(resDesc),
+ _dimSlices(dimSlices) {}
+
+size_t HwConstData::byteSize() const {
+ return checked_cast<size_t>(_resDesc.totalDimSize()) *
+ checked_cast<size_t>(_resDesc.elemSize());
+}
+
+void HwConstData::fillTempBuf(void* outBuf) const {
+ VPU_PROFILE(HwConstData);
+
+ VPU_THROW_UNLESS(
+ _resDesc.type() == DataType::FP16,
+ "Constant data has {} data type while only {} is supported",
+ _resDesc.type(), DataType::FP16);
+
+ const auto srcData = _origContent->get<fp16_t>();
+ auto dstData = static_cast<fp16_t*>(outBuf);
+
+ VPU_THROW_UNLESS(srcData != nullptr,
+ "Source buffer for constant data has null address");
+
+ auto getDimSlice = [this](const Dim dim) {
+ auto it = _dimSlices.find(dim);
+ if (it != _dimSlices.end()) {
+ return it->second;
+ }
+
+ const int startInd = 0;
+ const size_t size = _origDesc.dim(dim);
+
+ return Slice(startInd, size);
+ };
+
+ if (_origDesc.numDims() == 4) {
+ Slice slice = getDimSlice(Dim::N);
+
+ int startOC = slice.start;
+ size_t numOC = slice.size;
+
+ const auto IC = _origDesc.dim(Dim::C);
+ const auto K = _origDesc.dim(Dim::H);
+ const auto V = _origDesc.dim(Dim::W);
+
+ const auto kernelStride = V;
+ const auto inChannelStride = K * kernelStride;
+ const auto outerStride = IC * inChannelStride;
+
+ ie::parallel_for(numOC, [=](int oc) {
+ const auto ocSlice = oc;
+ oc += startOC;
+
+ const auto ocInner = oc % V;
+ const auto ocOuter = oc / V;
+ const auto ocSliceInner = ocSlice % V;
+ const auto ocSliceOuter = ocSlice / V;
+
+ const auto ocSrc = ocInner + ocOuter * outerStride;
+ const auto ocDst = ocSliceInner + ocSliceOuter * outerStride;
+
+ for (int ic = 0; ic < IC; ++ic)
+ for (int k = 0; k < K; ++k) {
+ const auto srcInd = ocSrc +
+ k * kernelStride +
+ ic * inChannelStride;
+ const auto dstInd = ocDst +
+ k * kernelStride +
+ ic * inChannelStride;
+
+ dstData[dstInd] = srcData[srcInd];
+ }
+ });
+ } else if (_origDesc.numDims() == 1) {
+ Slice slice = getDimSlice(Dim::C);
+
+ std::copy(srcData + slice.start, srcData + slice.start + slice.size, dstData);
+ } else {
+ THROW_IE_EXCEPTION << "Invalid number of dimensions " << _origDesc.numDims();
+ }
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/hw_weights_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+HwWeightsContent::HwWeightsContent(const DataContent::Ptr& origContent,
+ const DataDesc& origWeightsDesc,
+ const DataDesc& resDesc,
+ int numInputChannels,
+ int channelStartIndex) :
+ _origContent(origContent),
+ _origDesc(origWeightsDesc),
+ _resDesc(resDesc),
+ _numInputChannels(numInputChannels),
+ _channelStartIndex(channelStartIndex) {
+}
+
+size_t HwWeightsContent::byteSize() const {
+ return checked_cast<size_t>(_resDesc.totalDimSize()) *
+ checked_cast<size_t>(_resDesc.elemSize());
+}
+
+void HwWeightsContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(HwWeightsContent);
+
+ IE_ASSERT(_resDesc.type() == DataType::FP16);
+
+ const auto KX = _origDesc.dim(Dim::W);
+ const auto KY = _origDesc.dim(Dim::H);
+ const auto IC = _origDesc.dim(Dim::C);
+ const auto OC = _origDesc.dim(Dim::N);
+ const auto origTotalSize = _origDesc.totalDimSize();
+
+ const auto HW_OC_inner = _resDesc.dim(Dim::W);
+ const auto HW_OC_outer = _resDesc.dim(Dim::N);
+ IE_ASSERT(HW_OC_outer * HW_OC_inner >= OC);
+
+ const auto HW_K = _resDesc.dim(Dim::H);
+ IE_ASSERT(HW_K == KX * KY);
+
+ IE_ASSERT(_channelStartIndex < IC);
+ const auto HW_IC = _resDesc.dim(Dim::C);
+ const auto HW_IC_real = std::min(_numInputChannels, IC - _channelStartIndex);
+
+ const auto srcData = _origContent->get<fp16_t>();
+ IE_ASSERT(srcData != nullptr);
+
+ auto dstData = static_cast<fp16_t*>(tempBuf);
+
+ IE_ASSERT((_channelStartIndex + HW_IC_real) * HW_K + (OC - 1) * HW_K * IC - 1 < origTotalSize);
+ IE_ASSERT((OC - 1) % HW_OC_inner +
+ (HW_K - 1) * HW_OC_inner +
+ (HW_IC_real - 1) * HW_OC_inner * HW_K +
+ ((OC - 1) / 8) * HW_OC_inner * HW_K * HW_IC < _resDesc.totalDimSize());
+
+ if (KX == 1 && KY == 1) {
+ ie::parallel_for(OC, [=](int oc) {
+ const auto oc_inner = oc % HW_OC_inner;
+ const auto oc_outer = oc / HW_OC_inner;
+ for (int ic = 0; ic < HW_IC_real; ++ic) {
+ const auto srcInd =
+ (_channelStartIndex + ic) +
+ oc * IC;
+ const auto dstInd =
+ oc_inner +
+ ic * HW_OC_inner * HW_K +
+ oc_outer * HW_OC_inner * HW_K * HW_IC;
+
+ dstData[dstInd] = srcData[srcInd];
+ }
+ });
+ } else {
+ ie::parallel_for(OC, [=](int oc) {
+ const auto oc_inner = oc % HW_OC_inner;
+ const auto oc_outer = oc / HW_OC_inner;
+ for (int ic = 0; ic < HW_IC_real; ++ic) {
+ for (int ky = 0; ky < KY; ++ky) {
+ for (int kx = 0; kx < KX; ++kx) {
+ const auto srcInd =
+ (kx + ky * KX) +
+ (_channelStartIndex + ic) * HW_K +
+ oc * HW_K * IC;
+ const auto dstInd =
+ oc_inner +
+ (ky * KX + kx) * HW_OC_inner +
+ ic * HW_OC_inner * HW_K +
+ oc_outer * HW_OC_inner * HW_K * HW_IC;
+
+ dstData[dstInd] = srcData[srcInd];
+ }
+ }
+ }
+ });
+ }
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <vpu/utils/ie_helpers.hpp>
+
+namespace vpu {
+
+IeBlobContent::IeBlobContent(const ie::Blob::CPtr& blob, DataType resultDataType) : _blob(blob), _resultDataType(resultDataType) {
+ VPU_THROW_UNLESS(_resultDataType == DataType::FP16 || _resultDataType == DataType::S32,
+ "IeBlobContent creation error: {} result type is unsupported, only {} and {} are supported",
+ _resultDataType, DataType::FP16, DataType::S32);
+}
+
+size_t IeBlobContent::byteSize() const {
+ // Result can be converted into type with another size
+ const auto elementSize = _resultDataType == DataType::FP16 ? sizeof(fp16_t) : sizeof(int32_t);
+ return elementSize * _blob->size();
+}
+
+const void* IeBlobContent::getRaw() const {
+ if (_resultDataType == DataType::FP16) {
+ if (_blobFp16 == nullptr) {
+ _blobFp16 = _blob->getTensorDesc().getPrecision() == ie::Precision::FP16 ?
+ _blob : convertBlobFP32toFP16(_blob);
+ }
+ return _blobFp16->cbuffer();
+ } else { // S32
+ return _blob->cbuffer();
+ }
+}
+
+DataContent::Ptr ieBlobContent(const ie::Blob::CPtr& blob, DataType resultDataType) {
+ return std::make_shared<IeBlobContent>(blob, resultDataType);
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/kernel_binary_content.hpp>
+
+#include <string>
+
+namespace vpu {
+
+KernelBinaryContent::KernelBinaryContent(const std::string& blob) : _blob(blob) {
+ IE_ASSERT(!_blob.empty());
+}
+
+size_t KernelBinaryContent::byteSize() const {
+ return _blob.size();
+}
+
+const void* KernelBinaryContent::getRaw() const {
+ return _blob.data();
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/mean_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+//
+// MeanImageContent
+//
+
+MeanImageContent::MeanImageContent(const ie::PreProcessInfo& info, const DataDesc& desc) : _info(info), _desc(desc) {}
+
+size_t MeanImageContent::byteSize() const {
+ size_t countElem = checked_cast<size_t>(_desc.dim(Dim::W) * _desc.dim(Dim::H) * _desc.dim(Dim::C));
+ if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+ countElem *= 2;
+ }
+
+ return countElem * sizeof(fp16_t);
+}
+
+void MeanImageContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(MeanImageContent);
+
+ const size_t numOfChannel = _info.getNumberOfChannels();
+
+ const size_t imagePixels = checked_cast<size_t>(_desc.dim(Dim::W) * _desc.dim(Dim::H));
+ const size_t countElem = checked_cast<size_t>(_desc.dim(Dim::W) * _desc.dim(Dim::H) * _desc.dim(Dim::C));
+
+ const auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ auto dstPtr2 = dstPtr;
+ if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+ dstPtr2 += countElem;
+ }
+
+ ie::parallel_for(numOfChannel, [=](size_t i) {
+ const auto meanDataBlob = _info[i]->meanData;
+
+ ie::PrecisionUtils::f32tof16Arrays(
+ dstPtr2 + i * imagePixels,
+ meanDataBlob->buffer().as<const float*>(),
+ imagePixels,
+ -1.0f);
+ });
+
+ if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+ kchw_to_hwck(dstPtr2, dstPtr, _desc);
+ }
+}
+
+//
+// MeanValueContent
+//
+
+MeanValueContent::MeanValueContent(const ie::PreProcessInfo& info) : _info(info) {}
+
+size_t MeanValueContent::byteSize() const {
+ return _info.getNumberOfChannels() * sizeof(fp16_t);
+}
+
+void MeanValueContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(MeanValueContent);
+
+ const auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ ie::parallel_for(_info.getNumberOfChannels(), [dstPtr, this](size_t i) {
+ dstPtr[i] = ie::PrecisionUtils::f32tof16(-_info[i]->meanValue);
+ });
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/merge_fc_content.hpp>
+
+#include <ie_parallel.hpp>
+
+#include <numeric>
+
+namespace vpu {
+
+MergeFullyConnectedContentsByChannels::MergeFullyConnectedContentsByChannels(const std::vector<DataContent::CPtr> contents,
+ const std::vector<DataDesc> inDescs,
+ const DataDesc& resDesc) :
+ _contents(contents), _inDescs(inDescs), _resDesc(resDesc) {}
+
+size_t MergeFullyConnectedContentsByChannels::byteSize() const {
+ return checked_cast<size_t>(_resDesc.totalDimSize()) *
+ checked_cast<size_t>(_resDesc.elemSize());
+}
+
+void MergeFullyConnectedContentsByChannels::fillTempBuf(void* temp) const {
+ IE_ASSERT(!_contents.empty());
+ // vpu::DataNode has a content object and a vpu::DataDesc with a dimensions vector;
+ // the content has its own dimensions vector as well.
+ // The two can differ, so we extract the number of channels from the contents.
+ const auto dstC = std::accumulate(_inDescs.begin(), _inDescs.end(), 0, [](int reduction, const DataDesc& desc) {
+ return reduction + desc.dims()[Dim::C];});
+
+ for (std::size_t i = 0, dstChannelsOffset = 0; i < _inDescs.size(); ++i) {
+ const auto& content = _contents[i];
+ const auto& srcDesc = _inDescs[i];
+
+ const auto& srcDims = srcDesc.dims();
+ const auto& elemSize = srcDesc.elemSize();
+
+ const auto N = srcDims.get(Dim::N, 1);
+ const auto H = srcDims.get(Dim::H, 1);
+ const auto W = srcDims.get(Dim::W, 1) * elemSize;
+
+ const auto& srcC = srcDims[Dim::C];
+
+ const auto src = content->get<uint8_t>();
+ auto dst = static_cast<uint8_t*>(temp);
+
+ InferenceEngine::parallel_for4d(N, srcC, H, W, [dstChannelsOffset, N, H, W, src, dst, srcC, dstC](int n, int c, int h, int w) {
+ const auto& srcc = c;
+ const auto& dstc = dstChannelsOffset + c;
+
+ const auto& srcOffset = n * H * W * srcC + srcc * H * W + h * W + w;
+ const auto& dstOffset = n * H * W * dstC + dstc * H * W + h * W + w;
+ dst[dstOffset] = src[srcOffset];
+ });
+
+ dstChannelsOffset += srcC;
+ }
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/mtcnn_blob_content.hpp>
+
+namespace vpu {
+
+MTCNNBlobContent::MTCNNBlobContent(std::vector<char> blob) : _blob(std::move(blob)) {
+ IE_ASSERT(!_blob.empty());
+}
+
+size_t MTCNNBlobContent::byteSize() const {
+ return _blob.size();
+}
+
+const void* MTCNNBlobContent::getRaw() const {
+ return _blob.data();
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/prelu_blob_content.hpp>
+
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+PReLUBlobContent::PReLUBlobContent(const ie::Blob::CPtr& blob, const DataDesc& desc, int repeat) :
+ _blob(blob), _desc(desc), _repeat(repeat) {
+ VPU_INTERNAL_CHECK(repeat >= 1,
+ "PReLUBlobContent only supports repeat values of at least 1, actual is {}", repeat);
+}
+
+size_t PReLUBlobContent::byteSize() const {
+ return checked_cast<size_t>(_desc.totalDimSize()) *
+ checked_cast<size_t>(_desc.elemSize());
+}
+
+const void* PReLUBlobContent::getRaw() const {
+ if (_blobFp16 == nullptr) {
+ _blobFp16 = _blob->getTensorDesc().getPrecision() == ie::Precision::FP16 ?
+ _blob : convertBlobFP32toFP16(_blob);
+ }
+
+ if (_repeat == 1) {
+ return _blobFp16->cbuffer();
+ }
+
+ if (_tempFp16.empty()) {
+ VPU_PROFILE(PReLUBlobContent);
+
+ IE_ASSERT(_desc.totalDimSize() % _repeat == 0);
+
+ auto origNumElems = _desc.totalDimSize() / _repeat;
+ IE_ASSERT(checked_cast<size_t>(origNumElems) <= _blobFp16->size());
+
+ auto origPtr = _blobFp16->cbuffer().as<const fp16_t*>();
+ IE_ASSERT(origPtr != nullptr);
+
+ _tempFp16.resize(checked_cast<size_t>(_desc.totalDimSize()));
+
+ ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
+ std::copy_n(origPtr, origNumElems, _tempFp16.data() + i * origNumElems);
+ });
+ }
+
+ return _tempFp16.data();
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/priorbox_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <precision_utils.h>
+#include <ie_layers.h>
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+//
+// PriorBoxContent
+//
+
+PriorBoxContent::PriorBoxContent(
+ const DataDesc& inDesc0,
+ const DataDesc& inDesc1,
+ const DataDesc& outDesc,
+ const ie::CNNLayerPtr &layer) :
+ _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
+ _layer(layer) {
+ IE_ASSERT(layer != nullptr);
+}
+
+size_t PriorBoxContent::byteSize() const {
+ return checked_cast<size_t>(_outDesc.totalDimSize()) *
+ checked_cast<size_t>(_outDesc.elemSize());
+}
+
+// Computes SSD-style PriorBox anchors on the host, in fp16, directly into
+// tempBuf. The output is laid out as a 2 x dim tensor: the first row holds
+// box coordinates (dim = H*W*num_priors*4 values), the second row holds the
+// variances.
+void PriorBoxContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(PriorBoxContent);
+
+ auto tempPtr = static_cast<fp16_t*>(tempBuf);
+
+ // Read the layer attributes (IE PriorBox parameter set).
+ auto _min_sizes = _layer->GetParamAsFloats("min_size", {});
+ auto _max_sizes = _layer->GetParamAsFloats("max_size", {});
+ auto aspect_ratios = _layer->GetParamAsFloats("aspect_ratio");
+ auto _flip = static_cast<bool>(_layer->GetParamAsInt("flip"));
+ auto _clip = static_cast<bool>(_layer->GetParamAsInt("clip"));
+ auto _variance = _layer->GetParamAsFloats("variance");
+ auto _img_h = _layer->GetParamAsInt("img_h", 0);
+ auto _img_w = _layer->GetParamAsInt("img_w", 0);
+ auto _step = _layer->GetParamAsFloat("step", 0);
+ auto _offset = _layer->GetParamAsFloat("offset", 0);
+ auto _scale_all_sizes = static_cast<bool>(_layer->GetParamAsInt("scale_all_sizes", 1));
+
+ auto _fixed_sizes = _layer->GetParamAsFloats("fixed_size", {});
+ auto _fixed_ratios = _layer->GetParamAsFloats("fixed_ratio", {});
+ auto _densitys = _layer->GetParamAsFloats("density", {});
+
+ // Build the effective aspect-ratio list: always contains 1.0, skips
+ // near-duplicates (1e-6 tolerance) and, when "flip" is set, also adds the
+ // reciprocal of each ratio.
+ SmallVector<float> _aspect_ratios;
+ _aspect_ratios.reserve(aspect_ratios.size() + 1);
+
+ _aspect_ratios.push_back(1.0f);
+ for (const auto& aspect_ratio : aspect_ratios) {
+ bool exist = false;
+
+ for (const auto& _aspect_ratio : _aspect_ratios) {
+ if (fabsf(aspect_ratio - _aspect_ratio) < 1e-6) {
+ exist = true;
+ break;
+ }
+ }
+ if (!exist) {
+ _aspect_ratios.push_back(aspect_ratio);
+ if (_flip) {
+ // The reciprocal below would divide by zero for a 0.0 ratio.
+ if (isFloatEqual(aspect_ratio, 0.f)) {
+ THROW_IE_EXCEPTION << "[VPU] PriorBox has 0.0 aspect ratio param in flip mode, "
+ << " possible division by zero";
+ }
+ _aspect_ratios.push_back(1.0f / aspect_ratio);
+ }
+ }
+ }
+
+ // Number of prior boxes per spatial cell.
+ int _num_priors;
+ if (_scale_all_sizes) {
+ _num_priors = static_cast<int>(_aspect_ratios.size() * _min_sizes.size());
+ } else {
+ _num_priors = static_cast<int>(_aspect_ratios.size() + _min_sizes.size() - 1);
+ }
+
+ // fixed_size priors replace the min_size-based count.
+ if (!_fixed_sizes.empty()) {
+ _num_priors = static_cast<int>(_aspect_ratios.size() * _fixed_sizes.size());
+ }
+
+ // Each density value adds density^2 - 1 extra boxes per ratio.
+ if (!_densitys.empty()) {
+ for (const auto& _density : _densitys) {
+ if (!_fixed_ratios.empty()) {
+ _num_priors += _fixed_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
+ } else {
+ _num_priors += _aspect_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
+ }
+ }
+ }
+
+ _num_priors += _max_sizes.size();
+
+ // Feature-map size (W, H) and image size (IW, IH); the img_w/img_h
+ // attributes override the second input's dimensions when non-zero.
+ auto W = _inDesc0.dim(Dim::W);
+ auto H = _inDesc0.dim(Dim::H);
+ auto IW = _img_w == 0 ? _inDesc1.dim(Dim::W) : _img_w;
+ auto IH = _img_h == 0 ? _inDesc1.dim(Dim::H) : _img_h;
+ auto IWI = 1.0f / static_cast<float>(IW);
+ auto IHI = 1.0f / static_cast<float>(IH);
+
+ auto OW = (_outDesc.numDims() >= 4) ? _outDesc.dim(Dim::N) : 1;
+ auto OH = _outDesc.dim(Dim::W);
+
+ // step == 0 means: derive the stride from the image/feature-map ratio.
+ float step_x = 0.0f;
+ float step_y = 0.0f;
+
+ if (_step == 0) {
+ step_x = static_cast<float>(IW) / W;
+ step_y = static_cast<float>(IH) / H;
+ } else {
+ step_x = _step;
+ step_y = _step;
+ }
+
+ auto dst_data = tempPtr;
+
+ int dim = H * W * _num_priors * 4;
+ float center_x = 0.0f;
+ float center_y = 0.0f;
+
+ float box_width = 0.0f;
+ float box_height = 0.0f;
+
+ // The output must be a 2 x dim tensor (coordinates row + variances row).
+ if (_outDesc.dim(Dim::W) != dim || _outDesc.dim(Dim::H) != 2) {
+ THROW_IE_EXCEPTION << "[VPU] PriorBox output have invalid dimension, exptected " << dim << "x2"
+ << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H)
+ << ", layer name is: " << _layer->name;
+ }
+
+ // Clamp-then-convert helpers: fp32 value bounded below/above, then fp16.
+ auto max_fp16 = [](const float value, const float min) {
+ return ie::PrecisionUtils::f32tof16(value > min ? value : min);
+ };
+
+ auto min_fp16 = [](const float value, const float max) {
+ return ie::PrecisionUtils::f32tof16(value < max ? value : max);
+ };
+
+ size_t idx = 0;
+ for (int h = 0; h < H; ++h) {
+ for (int w = 0; w < W; ++w) {
+ // Center of the current cell in image coordinates.
+ if (_step == 0) {
+ center_x = (static_cast<float>(w) + 0.5f) * step_x;
+ center_y = (static_cast<float>(h) + 0.5f) * step_y;
+ } else {
+ center_x = (_offset + static_cast<float>(w)) * _step;
+ center_y = (_offset + static_cast<float>(h)) * _step;
+ }
+
+ // Fixed-size priors, densely sampled on a density_ x density_ grid
+ // inside the cell.
+ for (size_t s = 0; s < _fixed_sizes.size(); ++s) {
+ auto fixed_size_ = static_cast<size_t>(_fixed_sizes[s]);
+ box_width = box_height = fixed_size_ * 0.5f;
+
+ int density_ = 0;
+ int shift = 0;
+ if (s < _densitys.size()) {
+ density_ = static_cast<size_t>(_densitys[s]);
+ shift = static_cast<int>(_fixed_sizes[s] / density_);
+ }
+
+ if (!_fixed_ratios.empty()) {
+ for (const auto& fr : _fixed_ratios) {
+ const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(fr);
+ const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(fr);
+
+ for (size_t r = 0; r < density_; ++r) {
+ for (size_t c = 0; c < density_; ++c) {
+ const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
+ const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
+
+ // xmin, ymin clamped to 0; xmax, ymax clamped to 1.
+ dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
+ dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
+ dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
+ dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
+ }
+ }
+ }
+ } else {
+ if (!_densitys.empty()) {
+ for (int r = 0; r < density_; ++r) {
+ for (int c = 0; c < density_; ++c) {
+ const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
+ const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
+
+ dst_data[idx++] = max_fp16((center_x_temp - box_width) * IWI, 0.f);
+ dst_data[idx++] = max_fp16((center_y_temp - box_height) * IHI, 0.f);
+ dst_data[idx++] = min_fp16((center_x_temp + box_width) * IWI, 1.f);
+ dst_data[idx++] = min_fp16((center_y_temp + box_height) * IHI, 1.f);
+ }
+ }
+ }
+ // Rest of priors
+ for (const auto& ar : _aspect_ratios) {
+ // ratio 1.0 was already emitted above.
+ if (fabs(ar - 1.) < 1e-6) {
+ continue;
+ }
+
+ const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(ar);
+ const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(ar);
+ for (int r = 0; r < density_; ++r) {
+ for (int c = 0; c < density_; ++c) {
+ const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
+ const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
+
+ dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
+ dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
+ dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
+ dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
+ }
+ }
+ }
+ }
+ }
+
+ // Classic SSD scheme: min_size box, optional sqrt(min*max) box, then
+ // one box per non-unit aspect ratio.
+ for (size_t msIdx = 0; msIdx < _min_sizes.size(); msIdx++) {
+ box_width = _min_sizes[msIdx];
+ box_height = _min_sizes[msIdx];
+
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+
+ if (_max_sizes.size() > msIdx) {
+ // Extra prior with geometric-mean size sqrt(min * max).
+ box_width = box_height = std::sqrt(_min_sizes[msIdx] * _max_sizes[msIdx]);
+
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+ }
+
+ // When scale_all_sizes is off, the ratio boxes are emitted only once
+ // (for the last min_size) and always use _min_sizes[0].
+ if (_scale_all_sizes || (!_scale_all_sizes && (msIdx == _min_sizes.size() - 1))) {
+ size_t sIdx = _scale_all_sizes ? msIdx : 0;
+ for (const auto& ar : _aspect_ratios) {
+ if (std::fabs(ar - 1.0f) < 1e-6) {
+ continue;
+ }
+
+ box_width = _min_sizes[sIdx] * std::sqrt(ar);
+ box_height = _min_sizes[sIdx] / std::sqrt(ar);
+
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+ dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+ }
+ }
+ }
+ }
+ }
+
+ // Optionally clip every coordinate to [0, 1].
+ // NOTE(review): the min/max here compare fp16 values as produced by
+ // f32tof16 — confirm fp16_t ordering matches float ordering for this range.
+ if (_clip) {
+ for (int d = 0; d < dim; ++d) {
+ dst_data[d] = (std::min)((std::max)(dst_data[d], ie::PrecisionUtils::f32tof16(0.0f)), ie::PrecisionUtils::f32tof16(1.0f));
+ }
+ }
+
+ int channel_size = OH * OW;
+
+ // Advance to the second output row and fill the variances: either one
+ // value broadcast over the whole row, or one value per box coordinate.
+ dst_data += channel_size;
+
+ if (_variance.size() == 1) {
+ ie::parallel_for(channel_size, [&](int i) {
+ dst_data[i] = ie::PrecisionUtils::f32tof16(_variance[0]);
+ });
+ } else {
+ ie::parallel_for4d(H, W, _num_priors, 4, [&](int h, int w, int i, int j) {
+ dst_data[j + 4 * (i + _num_priors * (w + W * h))] = ie::PrecisionUtils::f32tof16(_variance[j]);
+ });
+ }
+}
+
+//
+// PriorBoxClusteredContent
+//
+
+// Host-computed content for the PriorBoxClustered layer. Keeps the two
+// input descriptors (feature map and image), the output descriptor and the
+// IE layer that carries the attributes; the layer must not be null.
+PriorBoxClusteredContent::PriorBoxClusteredContent(
+ const DataDesc& inDesc0,
+ const DataDesc& inDesc1,
+ const DataDesc& outDesc,
+ const ie::CNNLayerPtr& layer) :
+ _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
+ _layer(layer) {
+ IE_ASSERT(layer != nullptr);
+}
+
+// Size in bytes of the generated blob: total number of output elements
+// times the element size taken from the output descriptor.
+size_t PriorBoxClusteredContent::byteSize() const {
+ return checked_cast<size_t>(_outDesc.totalDimSize()) *
+ checked_cast<size_t>(_outDesc.elemSize());
+}
+
+// Computes PriorBoxClustered anchors on the host, in fp16, directly into
+// tempBuf. Row 0 of the 2 x dimx output holds box coordinates, row 1 the
+// variances.
+void PriorBoxClusteredContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(PriorBoxClusteredContent);
+
+ auto tempPtr = static_cast<fp16_t*>(tempBuf);
+
+ // Layer attributes (IE PriorBoxClustered parameter set).
+ auto widths_ = _layer->GetParamAsFloats("width");
+ auto heights_ = _layer->GetParamAsFloats("height");
+ auto clip_ = _layer->GetParamAsInt("clip");
+ auto variance_ = _layer->GetParamAsFloats("variance");
+ auto img_h_ = _layer->GetParamAsInt("img_h", 0);
+ auto img_w_ = _layer->GetParamAsInt("img_w", 0);
+ auto step_ = _layer->GetParamAsFloat("step", 0);
+ auto step_h_ = _layer->GetParamAsFloat("step_h", 0);
+ auto step_w_ = _layer->GetParamAsFloat("step_w", 0);
+ auto offset_ = _layer->GetParamAsFloat("offset", 0);
+
+ // One prior per (width, height) pair.
+ auto num_priors_ = widths_.size();
+
+ // Default variance when the attribute is absent.
+ if (variance_.empty()) {
+ variance_.push_back(0.1);
+ }
+
+ auto layer_width = _inDesc0.dim(Dim::W);
+ auto layer_height = _inDesc0.dim(Dim::H);
+
+ // img_w/img_h attributes override the second input's dimensions.
+ auto img_width = img_w_ == 0 ? _inDesc1.dim(Dim::W) : img_w_;
+ auto img_height = img_h_ == 0 ? _inDesc1.dim(Dim::H) : img_h_;
+
+ // Per-axis step falls back to "step", then to the image/feature ratio.
+ auto step_w = step_w_ == 0 ? step_ : step_w_;
+ auto step_h = step_h_ == 0 ? step_ : step_h_;
+ if (step_w == 0 || step_h == 0) {
+ step_w = static_cast<float>(img_width) / layer_width;
+ step_h = static_cast<float>(img_height) / layer_height;
+ }
+
+ auto expetected_output_dimx = layer_height * layer_width * num_priors_ * 4;
+ if (_outDesc.dim(Dim::W) != expetected_output_dimx || _outDesc.dim(Dim::H) != 2) {
+ THROW_IE_EXCEPTION << "PriorBoxClustered output has invalid dimension, exptected " << expetected_output_dimx << "x2"
+ << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H) << ", layer name is: " << _layer->name;
+ }
+
+ auto offset = _outDesc.dim(Dim::W);
+ auto var_size = variance_.size();
+
+ // top_data_0 -> coordinates row, top_data_1 -> variances row.
+ auto top_data_0 = tempPtr;
+ auto top_data_1 = top_data_0 + offset;
+
+ ie::parallel_for2d(layer_height, layer_width, [=](int h, int w) {
+ auto center_x = (w + offset_) * step_w;
+ auto center_y = (h + offset_) * step_h;
+
+ for (int s = 0; s < num_priors_; ++s) {
+ auto box_width = widths_[s];
+ auto box_height = heights_[s];
+
+ // Corner coordinates normalized by the image size.
+ auto xmin = (center_x - box_width / 2.0f) / img_width;
+ auto ymin = (center_y - box_height / 2.0f) / img_height;
+ auto xmax = (center_x + box_width / 2.0f) / img_width;
+ auto ymax = (center_y + box_height / 2.0f) / img_height;
+
+ if (clip_) {
+ xmin = std::min(std::max(xmin, 0.0f), 1.0f);
+ ymin = std::min(std::max(ymin, 0.0f), 1.0f);
+ xmax = std::min(std::max(xmax, 0.0f), 1.0f);
+ ymax = std::min(std::max(ymax, 0.0f), 1.0f);
+ }
+
+ top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 0] = ie::PrecisionUtils::f32tof16(xmin);
+ top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 1] = ie::PrecisionUtils::f32tof16(ymin);
+ top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 2] = ie::PrecisionUtils::f32tof16(xmax);
+ top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 3] = ie::PrecisionUtils::f32tof16(ymax);
+
+ for (int j = 0; j < var_size; j++) {
+ auto index = h * layer_width * num_priors_ * var_size + w * num_priors_ * var_size + s * var_size + j;
+ top_data_1[index] = ie::PrecisionUtils::f32tof16(variance_[j]);
+ }
+ }
+ });
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+// Content that repeats a single scalar value `count` times.
+ReplicatedContent::ReplicatedContent(float val, int count, const DataDesc& desc) :
+ _factor{val}, _count(count), _desc(desc) {}
+
+// Content that tiles another content `count` times back-to-back.
+ReplicatedContent::ReplicatedContent(DataContent::Ptr origContent, int count, const DataDesc& desc) :
+ _origContent(origContent), _count(count), _desc(desc) {}
+
+// Scalar mode: _count fp16 values. Tiled mode: the whole descriptor's
+// element count as fp16 (which must split evenly into _count replicas).
+size_t ReplicatedContent::byteSize() const {
+ if (!_origContent) {
+ return checked_cast<size_t>(_count) * sizeof(fp16_t);
+ } else {
+ IE_ASSERT(_desc.totalDimSize() % _count == 0);
+
+ return checked_cast<size_t>(_desc.totalDimSize()) * sizeof(fp16_t);
+ }
+}
+
+// Fills tempBuf either with _count copies of the fp16-converted scalar
+// _factor, or with _count back-to-back copies of the original content.
+void ReplicatedContent::fillTempBuf(void* tempBuf) const {
+ VPU_PROFILE(ReplicatedContent);
+
+ auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ if (!_origContent) {
+ std::fill_n(dstPtr, _count, ie::PrecisionUtils::f32tof16(_factor));
+ } else {
+ IE_ASSERT(_desc.totalDimSize() % _count == 0);
+
+ // Number of elements in one replica.
+ auto origCount = _desc.totalDimSize() / _count;
+ auto origPtr = _origContent->get<fp16_t>();
+ IE_ASSERT(origPtr != nullptr);
+
+ // Replicas are disjoint, so the copies can run in parallel.
+ ie::parallel_for(_count, [origPtr, origCount, dstPtr](int i) {
+ std::copy_n(origPtr, origCount, dstPtr + i * origCount);
+ });
+ }
+}
+
+// Factory helpers wrapping ReplicatedContent construction.
+DataContent::Ptr replicateContent(float val, int count, const DataDesc& desc) {
+ return std::make_shared<ReplicatedContent>(val, count, desc);
+}
+
+DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count, const DataDesc& desc) {
+ return std::make_shared<ReplicatedContent>(origContent, count, desc);
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/scaled_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+// Content that multiplies every fp16 element of origContent by a constant
+// scale factor.
+ScaledContent::ScaledContent(const DataContent::Ptr& origContent, float scale) :
+ _origContent(origContent), _factor(scale) {
+}
+
+// Same byte size as the wrapped content (element-wise transform).
+size_t ScaledContent::byteSize() const {
+ return _origContent->byteSize();
+}
+
+// Writes the scaled copy of the wrapped content into tempBuf: each element
+// is converted fp16 -> fp32, multiplied by _factor, converted back.
+void ScaledContent::fillTempBuf(void *tempBuf) const {
+ VPU_PROFILE(ScaledContent);
+
+ const auto totalSize = _origContent->byteSize() / sizeof(fp16_t);
+
+ auto srcPtr = _origContent->get<fp16_t>();
+ IE_ASSERT(srcPtr != nullptr);
+
+ auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+ ie::parallel_for(totalSize, [this, srcPtr, dstPtr](int i) {
+ dstPtr[i] = ie::PrecisionUtils::f32tof16(ie::PrecisionUtils::f16tof32(srcPtr[i]) * _factor);
+ });
+}
+
+// Factory helper wrapping ScaledContent construction.
+DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale) {
+ return std::make_shared<ScaledContent>(origContent, scale);
+}
+
+} // namespace vpu
#include <vpu/model/model.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/profiling.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <details/caseless.hpp>
+#include "blob_factory.hpp"
+
#include <cctype>
#include <memory>
#include <string>
#include <exception>
#include <algorithm>
-#include <details/caseless.hpp>
-
-#include <vpu/compile_env.hpp>
-#include <vpu/utils/auto_scope.hpp>
-#include <vpu/utils/profiling.hpp>
-
-#include "blob_factory.hpp"
-
namespace vpu {
//
const DataContent::Ptr& content) {
IE_ASSERT(content != nullptr);
+ VPU_THROW_UNLESS(desc.totalDimSize() * desc.elemSize() == content->byteSize(),
+ "duplicateData error: while duplicating {} Const data got different "
+ "newDesc and content byte sizes ({} and {} respectively)",
+ name, desc.totalDimSize() * desc.elemSize(), content->byteSize());
+
std::shared_ptr<DataNode> data(new DataNode);
data->_name = name;
data->_model = this;
data->_content = content;
- content->_desc = desc;
data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
_dataList.push_back(data);
if (generator) {
generator(ieBlob);
}
- return addConstData(name, descriptor, ieBlobContent(ieBlob));
+ return addConstData(name, descriptor, ieBlobContent(ieBlob, descriptor.type()));
}
Data ModelObj::addNewData(
newData->_model = this;
if (newDataUsage == DataUsage::Const) {
- newData->_content = newContent != nullptr ? newContent : origData->content();
- if (newContent != nullptr) {
- newContent->_desc = newData->_desc;
- }
+ const auto& content = newContent != nullptr ? newContent : origData->content();
+ const auto& desc = newDesc != DataDesc() ? newDesc : origData->desc();
+
+ VPU_THROW_UNLESS(desc.totalDimSize() * desc.elemSize() == content->byteSize(),
+ "duplicateData error: while duplicating {} Const data got different "
+ "desc and content byte sizes ({} and {} respectively)",
+ origData->name(), desc.totalDimSize() * desc.elemSize(), content->byteSize());
+
+ newData->_content = content;
}
newData->attrs().copyFrom(origData->attrs());
#include <vpu/frontend/frontend.hpp>
-#include <cmath>
-
-#include <vector>
-#include <memory>
-
-#include <precision_utils.h>
-#include <ie_parallel.hpp>
-
#include <vpu/utils/ie_helpers.hpp>
#include <vpu/utils/numeric.hpp>
#include <vpu/utils/profiling.hpp>
+#include <vpu/model/data_contents/batch_norm_contents.hpp>
-namespace vpu {
-
-namespace {
-
-class BatchNormalizationWeightsContent final : public CalculatedDataContent {
-public:
- BatchNormalizationWeightsContent(
- const DataContent::Ptr& origContent,
- float epsilon) :
- CalculatedDataContent({origContent}), _epsilon(epsilon) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(BatchNormalizationWeightsContent);
-
- auto srcPtr = baseContents[0]->get<fp16_t>();
- auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- ie::parallel_for(desc().totalDimSize(), [this, srcPtr, dstPtr](int i) {
- float val = ie::PrecisionUtils::f16tof32(srcPtr[i]) + _epsilon;
- val = 1.0f / std::sqrt(val);
- dstPtr[i] = ie::PrecisionUtils::f32tof16(val);
- });
- }
-
-private:
- float _epsilon;
-};
-
-class BatchNormalizationBiasesContent final : public CalculatedDataContent {
-public:
- BatchNormalizationBiasesContent(
- const DataContent::Ptr& origContent,
- const DataContent::Ptr& weightsContent) :
- CalculatedDataContent({origContent, weightsContent}) {
- }
-
-protected:
- void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
- VPU_PROFILE(BatchNormalizationBiasesContent);
-
- auto origPtr = baseContents[0]->get<fp16_t>();
- auto weightsPtr = baseContents[1]->get<fp16_t>();
-
- auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
- ie::parallel_for(desc().totalDimSize(), [origPtr, weightsPtr, dstPtr](int i) {
- // TODO : need to be extracted from IE layer.
- float beta = 0.0f;
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
- auto wVal = ie::PrecisionUtils::f16tof32(weightsPtr[i]);
- dstPtr[i] = ie::PrecisionUtils::f32tof16(beta - wVal * ie::PrecisionUtils::f16tof32(origPtr[i]));
- });
- }
-};
+#include <cmath>
+#include <vector>
+#include <memory>
-} // namespace
+namespace vpu {
void FrontEnd::parseBatchNorm(const Model& model, const ie::CNNLayerPtr& _layer, const DataVector& inputs, const DataVector& outputs) const {
IE_ASSERT(inputs.size() == 1);
#include <vpu/frontend/frontend.hpp>
+#include <vpu/frontend/custom_layer.hpp>
+#include <vpu/utils/simple_math.hpp>
+#include <vpu/model/data_contents/kernel_binary_content.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
#include <vector>
#include <memory>
#include <string>
#include <algorithm>
#include <tuple>
-#include <vpu/frontend/custom_layer.hpp>
-#include <vpu/utils/simple_math.hpp>
-
-
namespace vpu {
static void calcSizesFromParams(const DataDesc &desc, const SmallVector<std::string> &bufferSizeRules, SmallVector<int, 3> &sizes);
namespace {
-class KernelBinaryContent final : public DataContent {
-public:
- explicit KernelBinaryContent(const std::string& blob) : _blob(blob) {
- IE_ASSERT(!_blob.empty());
- }
-
- const void* getRaw() const override {
- IE_ASSERT(desc().totalDimSize() * desc().elemSize() == _blob.length());
- return _blob.data();
- }
-
-private:
- std::string _blob;
-};
-
class CustomStage final : public StageNode {
public:
using StageNode::StageNode;
auto input2 = inputEdge(2)->input();
auto output = outputEdge(0)->output();
- input0->serializeBuffer(serializer, output->desc().dimsOrder());
+ input0->serializeBuffer(serializer);
output->serializeBuffer(serializer);
- input1->serializeBuffer(serializer, output->desc().dimsOrder());
- input2->serializeBuffer(serializer, output->desc().dimsOrder());
+ input1->serializeBuffer(serializer);
+ input2->serializeBuffer(serializer);
}
};
// Copyright (C) 2019-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-//
#include <vpu/frontend/frontend.hpp>
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-//
#include <vpu/frontend/frontend.hpp>
--- /dev/null
+// Copyright (C) 2019-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+// Stage for ExperimentalDetectronTopKROIs: selects the top-scoring ROIs.
+// Takes two FP16 inputs (ROIs and their probabilities) and produces one
+// FP16 output; the only parameter serialized is "max_rois".
+class ExpTopKROIsStage final : public StageNode {
+private:
+ StagePtr cloneImpl() const override {
+ return std::make_shared<ExpTopKROIsStage>(*this);
+ }
+
+ // No layout preference: default order propagation.
+ void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+ }
+
+ // All inputs and outputs must be stored compactly (no padding strides).
+ void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+ for (const auto& inEdge : inputEdges()) {
+ stridesInfo.setInput(inEdge, StridesRequirement::compact());
+ }
+ for (const auto& outEdge : outputEdges()) {
+ stridesInfo.setOutput(outEdge, StridesRequirement::compact());
+ }
+ }
+
+ void finalizeDataLayoutImpl() override {
+ }
+
+ void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& batchInfo) override {
+ }
+
+ // Both inputs and the output must be FP16.
+ void initialCheckImpl() const override {
+ assertInputsOutputsTypes(this,
+ {{DataType::FP16}, {DataType::FP16}},
+ {{DataType::FP16}});
+ }
+
+ // Serialized parameters: only the "max_rois" attribute.
+ void serializeParamsImpl(BlobSerializer& serializer) const override {
+ const auto& params = attrs().get<int32_t>("max_rois");
+
+ serializer.append(params);
+ }
+
+ void serializeDataImpl(BlobSerializer& serializer) const override {
+ input(0)->serializeBuffer(serializer);
+ input(1)->serializeBuffer(serializer);
+ output(0)->serializeBuffer(serializer);
+ }
+};
+
+} // namespace
+
+// Parses an ExperimentalDetectronTopKROIs IE layer into an ExpTopKROIsStage.
+// Validates the shapes: input0 is (N, 4) ROIs, input1 is a length-N
+// probability vector, output is (max_rois, 4).
+void FrontEnd::parseExpTopKROIs(
+ const Model& model,
+ const ie::CNNLayerPtr& layer,
+ const DataVector& inputs,
+ const DataVector& outputs) const {
+ VPU_THROW_UNLESS(inputs.size() == 2, "Layer %s must have 2 input tensors.", layer->name);
+ VPU_THROW_UNLESS(outputs.size() == 1, "Layer %s must have 1 output tensor.", layer->name);
+
+ int32_t max_rois = layer->GetParamAsInt("max_rois", 0);
+
+ auto inputRois = inputs[0];
+ auto inputProbs = inputs[1];
+ auto outputRois = outputs[0];
+
+ // input0: 2D tensor of ROIs, 4 coordinates per ROI.
+ VPU_THROW_UNLESS((inputRois->desc().dims().size() == 2) &&
+ (inputRois->desc().dim(Dim::C) == 4),
+ "Wrong shape for input 0 of layer %s, expected (N, 4), got: dims size = %lu, dim C = %d",
+ layer->name, inputRois->desc().dims().size(), inputRois->desc().dim(Dim::C));
+
+ // input1: 1D probability vector.
+ VPU_THROW_UNLESS(inputProbs->desc().dims().size() == 1,
+ "Wrong shape for input 1 of layer %s, expected dim size = 1, got: %lu",
+ layer->name, inputProbs->desc().dims().size());
+
+ // One probability per ROI.
+ VPU_THROW_UNLESS(inputProbs->desc().dim(Dim::C) == inputRois->desc().dim(Dim::N),
+ "Layer %s: input0 dim N and input1 dim C must be equal, got: input0 (N = %d), input1 (C = %d)",
+ layer->name, inputProbs->desc().dim(Dim::N), inputProbs->desc().dim(Dim::C));
+
+ // output: (max_rois, 4).
+ VPU_THROW_UNLESS((outputRois->desc().dims().size() == 2) &&
+ (outputRois->desc().dim(Dim::C) == 4),
+ "Wrong shape for output 0 of layer %s, expected (N, 4), got: dims size = %lu, dim C = %d",
+ layer->name, outputRois->desc().dims().size(), outputRois->desc().dim(Dim::C));
+
+ VPU_THROW_UNLESS(outputRois->desc().dim(Dim::N) == max_rois,
+ "Wrong shape for output 0 of layer %s, expected dim N = %d, got: dim N = %d",
+ layer->name, static_cast<int>(max_rois), outputRois->desc().dim(Dim::N));
+
+ auto stage = model->addNewStage<ExpTopKROIsStage>(
+ layer->name,
+ StageType::ExpTopKROIs,
+ layer,
+ inputs,
+ outputs);
+
+ stage->attrs().set("max_rois", max_rois);
+}
+
+} // namespace vpu
}
StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
- return StageSHAVEsRequirements::OnlyOne;
+ return StageSHAVEsRequirements::NotNeeded;
}
void initialCheckImpl() const override {
#include <vpu/frontend/frontend.hpp>
+#include <vpu/graph_transformer.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/model/data_contents/mtcnn_blob_content.hpp>
+
+#include <cpp/ie_cnn_net_reader.h>
+
#include <vector>
#include <fstream>
#include <string>
#include <memory>
#include <set>
-#include <cpp/ie_cnn_net_reader.h>
-
-#include <vpu/graph_transformer.hpp>
-#include <vpu/compile_env.hpp>
-#include <vpu/utils/file_system.hpp>
-
namespace vpu {
// Must be synchronized with MvTensor
}
};
-class MTCNNBlobContent final : public DataContent {
-public:
- explicit MTCNNBlobContent(std::vector<char>&& blob) : _blob(std::forward<std::vector<char>>(blob)) {
- IE_ASSERT(!_blob.empty());
- }
-
- const void* getRaw() const override {
- IE_ASSERT(desc().totalDimSize() * desc().elemSize() == _blob.size());
- return _blob.data();
- }
-
-private:
- std::vector<char> _blob;
-};
-
std::pair<int, int> getResolution(const std::string& str) {
std::istringstream stream(str);
std::string output;
auto innerGraphsDesc = DataDesc({mergedBlob.size()});
innerGraphsDesc.setType(DataType::U8);
- auto innerGraphs = model->addConstData(layer->name + "@innerGraphs", innerGraphsDesc, std::make_shared<MTCNNBlobContent>(std::move(mergedBlob)));
+ auto innerGraphs = model->addConstData(layer->name + "@innerGraphs", innerGraphsDesc, std::make_shared<MTCNNBlobContent>(mergedBlob));
auto stage = model->addNewStage<MTCNNStage>(layer->name, StageType::MTCNN, layer, {input, innerGraphs}, {output});
stage->attrs().set("pyramid", pyramid);
const auto input = inputEdge(0)->input();
const auto output = outputEdge(0)->output();
- IE_ASSERT(input->memoryOffset() % 16 == 0);
- IE_ASSERT(output->memoryOffset() % 16 == 0);
+ IE_ASSERT(input->dataLocation().offset % 16 == 0);
+ IE_ASSERT(output->dataLocation().offset % 16 == 0);
if (attrs().get<HwOpType>("hwOpType") != HwOpType::POOL) {
const auto weights = inputEdge(1)->input();
const auto biases = inputEdge(2)->input();
const auto scales = inputEdge(3)->input();
- IE_ASSERT(weights->memoryOffset() % 16 == 0);
- IE_ASSERT(biases->memoryOffset() % 16 == 0);
- IE_ASSERT(scales->memoryOffset() % 16 == 0);
+ IE_ASSERT(weights->dataLocation().offset % 16 == 0);
+ IE_ASSERT(biases->dataLocation().offset % 16 == 0);
+ IE_ASSERT(scales->dataLocation().offset % 16 == 0);
}
}
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+#include <precision_utils.h>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+// Stage for the NonZero operation: one input (FP16/U8/S32), two S32
+// outputs — the indices of non-zero elements and the dynamic dims tensor.
+class NonZero : public StageNode {
+private:
+ StagePtr cloneImpl() const override {
+ return std::make_shared<NonZero>(*this);
+ }
+
+ // No layout preference: default order propagation.
+ void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+ }
+
+ // Require the innermost (minor) dimension of every tensor to be compact.
+ void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+ auto inputStrides = input(0)->requiredStrides();
+ auto outIndicesStrides = output(0)->requiredStrides();
+ auto outDimsStrides = output(1)->requiredStrides();
+
+ stridesInfo.setInput(inputEdge(0), inputStrides.add(0, DimStride::Compact));
+ stridesInfo.setOutput(outputEdge(0), outIndicesStrides.add(0, DimStride::Compact));
+ stridesInfo.setOutput(outputEdge(1), outDimsStrides.add(0, DimStride::Compact));
+ }
+
+ void finalizeDataLayoutImpl() override {
+ }
+
+ void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& batchInfo) override {
+ }
+
+ void initialCheckImpl() const override {
+ assertInputsOutputsTypes(this,
+ {{DataType::FP16, DataType::U8, DataType::S32}},
+ {{DataType::S32}, {DataType::S32}});
+ }
+
+ void finalCheckImpl() const override {
+ }
+
+ // NonZero has no scalar parameters to serialize.
+ void serializeParamsImpl(BlobSerializer& serializer) const override {
+ }
+
+ void serializeDataImpl(BlobSerializer& serializer) const override {
+ VPU_INTERNAL_CHECK(numInputs() == 1,
+ "Nonzero stage with name %s must have only 1 input, "
+ "actually provided %d", name(), numInputs());
+ VPU_INTERNAL_CHECK(numOutputs() == 2,
+ "Nonzero stage with name %s must have only 2 outputs, "
+ "actually provided %d", name(), numOutputs());
+
+ input(0)->serializeBuffer(serializer);
+ output(0)->serializeBuffer(serializer);
+ output(1)->serializeBuffer(serializer);
+ }
+};
+
+} // namespace
+
+// Parses a NonZero IE layer into a NonZero stage. The Indices output is a
+// 2D tensor sized for the worst case (every element non-zero); the Dims
+// output is a 1D tensor holding the dynamic shape of the result.
+void FrontEnd::parseNonZero(
+ const Model& model,
+ const ie::CNNLayerPtr& layer,
+ const DataVector& inputs,
+ const DataVector& outputs) const {
+ VPU_THROW_UNLESS(inputs.size() == 1,
+ "Nonzero layer with name %s must have only 1 input, actually provided %d",
+ layer->name, inputs.size());
+ VPU_THROW_UNLESS(outputs.size() == 2,
+ "Nonzero layer with name %s must have only 2 outputs, actually provided %d",
+ layer->name, outputs.size());
+
+ const auto input = inputs[0];
+ const auto inputNumDims = input->desc().numDims();
+ const auto totalIndicesDimSize = input->desc().totalDimSize();
+
+ // Indices output: minor dim must fit all input elements, major dim must
+ // equal the input rank (one index row per dimension).
+ const auto outIndicesDesc = outputs[0]->desc();
+ const auto outIndicesPerm = outIndicesDesc.dimsOrder().toPermutation();
+ const auto minorIndicesDim = outIndicesDesc.dim(outIndicesPerm.at(0));
+ const auto majorIndicesDim = outIndicesDesc.dim(outIndicesPerm.at(1));
+ VPU_THROW_UNLESS(outIndicesDesc.numDims() == 2,
+ "NonZero layer with name %s must have 2D output Indices tensor, "
+ "actually provided %dD tensor",
+ layer->name, outIndicesDesc.numDims());
+ VPU_THROW_UNLESS(minorIndicesDim >= totalIndicesDimSize,
+ "NonZero layer with name %s must have output Indices tensor with minor dim "
+ "size >= total amount of elements of input tensor, actually provided %d >= %d",
+ layer->name, minorIndicesDim, totalIndicesDimSize);
+ VPU_THROW_UNLESS(majorIndicesDim == inputNumDims,
+ "NonZero layer with name %s must have output Indices tensor with major dim "
+ "size == number of dimensions of input tensor, actually provided %d == %d",
+ layer->name, majorIndicesDim, inputNumDims);
+
+ // Dims output: 1D with at least 2 elements.
+ const auto outDimsDesc = outputs[1]->desc();
+ const auto outDimsPerm = outDimsDesc.dimsOrder().toPermutation();
+ const auto minorDimsDim = outDimsDesc.dim(outDimsPerm.at(0));
+ VPU_THROW_UNLESS(outDimsDesc.numDims() == 1,
+ "NonZero layer with name %s must have 1D output Dims tensor, "
+ "actually provided %dD tensor",
+ layer->name, outDimsDesc.numDims());
+ VPU_THROW_UNLESS(minorDimsDim >= 2,
+ "NonZero layer with name %s must have output Dims tensor with minor dim "
+ "size >= 2, actually provided %d",
+ layer->name, minorDimsDim);
+
+ model->addNewStage<NonZero>(
+ layer->name,
+ StageType::NonZero,
+ layer,
+ inputs,
+ outputs);
+}
+
+} // namespace vpu
#include <vpu/frontend/frontend.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
#include <vector>
#include <map>
#include <unordered_set>
#include <vpu/frontend/frontend.hpp>
+#include <vpu/stages/post_op_stage.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/profiling.hpp>
+#include <vpu/model/data_contents/prelu_blob_content.hpp>
+
+#include <ie_parallel.hpp>
+
#include <vector>
#include <memory>
-#include <vpu/stages/post_op_stage.hpp>
-
namespace vpu {
namespace {
auto weights = model->addConstData(
layer->name + "@weights",
DataDesc({output->desc().dim(Dim::C)}),
- ieBlobContent(weightsBlob, channelShared ? output->desc().dim(Dim::C) : 1));
+ std::make_shared<PReLUBlobContent>(weightsBlob, DataDesc({output->desc().dim(Dim::C)}),
+ channelShared ? output->desc().dim(Dim::C) : 1));
model->addNewStage<PReluStage>(layer->name, StageType::PRelu, layer, {inputs[0], weights}, outputs);
}
#include <vpu/frontend/frontend.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
#include <algorithm>
#include <memory>
#include <set>
input1,
"",
DataDesc(),
- ieBlobContent(newIndicesBlob));
+ ieBlobContent(newIndicesBlob, DataType::S32));
model()->replaceStageInput(inputEdge(1), newList);
}
#include <vpu/frontend/frontend.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
#include <vector>
#include <string>
#include <memory>
#include <set>
-#include <vpu/utils/numeric.hpp>
-
namespace vpu {
namespace {
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+VPU_DECLARE_ENUM(ROIAlignMode,
+ Average = 0,
+ Max = 1
+)
+
+static const std::string s_mode = "mode";
+static const std::string s_pooled_w = "pooled_w";
+static const std::string s_pooled_h = "pooled_h";
+static const std::string s_sampling_ratio = "sampling_ratio";
+static const std::string s_spatial_scale = "spatial_scale";
+
+namespace {
+
+// Stage implementing ROIAlign pooling over regions of interest.
+// Inputs: feature map (FP16), ROI boxes (FP16), batch indices (S32); output: FP16.
+// Pooling mode (avg/max) and geometry come from the attrs set in parseROIAlign.
+class ROIAlignStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ROIAlignStage>(*this);
+    }
+
+    // Only the feature-map input (0) and the output get their channel
+    // dimension moved to position 2; inputs 1 and 2 keep their default order.
+    void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+        orderInfo.setInput(inputEdge(0), inputEdge(0)->input()->desc().dimsOrder().createMovedDim(Dim::C, 2));
+        orderInfo.setOutput(outputEdge(0), outputEdge(0)->output()->desc().dimsOrder().createMovedDim(Dim::C, 2));
+    }
+
+    // All inputs and outputs must be densely packed (no padded strides).
+    void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+        for (const auto& inEdge : inputEdges()) {
+            stridesInfo.setInput(inEdge, StridesRequirement::compact());
+        }
+        for (const auto& outEdge : outputEdges()) {
+            stridesInfo.setOutput(outEdge, StridesRequirement::compact());
+        }
+    }
+
+    // No extra layout finalization is needed beyond the propagated orders.
+    void finalizeDataLayoutImpl() override {
+    }
+
+    // No per-batch splitting rules: leave batch support info empty.
+    void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& batchInfo) override {
+    }
+
+    void initialCheckImpl() const override {
+        assertInputsOutputsTypes(this, {{DataType::FP16}, {DataType::FP16}, {DataType::S32}}, {{DataType::FP16}});
+    }
+
+    // Parameter order (pooled_w, pooled_h, sampling_ratio, spatial_scale, mode)
+    // presumably matches the device kernel's expectations — keep in sync.
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        const auto pooled_w = attrs().get<int>(s_pooled_w);
+        const auto pooled_h = attrs().get<int>(s_pooled_h);
+        const auto sampling_ratio = attrs().get<int>(s_sampling_ratio);
+        const auto spatial_scale = attrs().get<float>(s_spatial_scale);
+        const auto mode = attrs().get<ROIAlignMode>(s_mode);
+
+        serializer.append(static_cast<uint32_t>(pooled_w));
+        serializer.append(static_cast<uint32_t>(pooled_h));
+        serializer.append(static_cast<uint32_t>(sampling_ratio));
+        serializer.append(static_cast<float>(spatial_scale));
+        serializer.append(static_cast<ROIAlignMode>(mode));
+    }
+
+    // Buffers are serialized inputs-first (in edge order), then the output.
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        for (int i = 0; i < numInputs(); i++) {
+            inputEdge(i)->input()->serializeBuffer(serializer);
+        }
+
+        outputEdge(0)->output()->serializeBuffer(serializer);
+    }
+};
+
+} // namespace
+
+// Parse an IE ROIAlign layer: validate I/O counts and attributes, then add a
+// ROIAlignStage to the model and fill its attrs.
+void FrontEnd::parseROIAlign(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const {
+    VPU_THROW_UNLESS(inputs.size() == 3,
+                     "ROIAlign stage with name {} has invalid number of inputs: expected 3, "
+                     "actually provided {}", layer->name, inputs.size());
+
+    VPU_THROW_UNLESS(outputs.size() == 1,
+                     "ROIAlign stage with name {} has invalid number of outputs: expected 1, "
+                     "actually provided {}", layer->name, outputs.size());
+
+    // Validate the pooling mode BEFORE mutating the model: previously the
+    // stage was added first, so an unsupported mode threw after the model
+    // already contained a stage with an unset `s_mode` attribute.
+    const auto modeStr = layer->GetParamAsString("mode", "");
+    ROIAlignMode mode = ROIAlignMode::Average;
+    if (modeStr == "avg") {
+        mode = ROIAlignMode::Average;
+    } else if (modeStr == "max") {
+        mode = ROIAlignMode::Max;
+    } else {
+        VPU_THROW_FORMAT("Layer with name {} supports only (avg, max) mode", layer->name);
+    }
+
+    const auto stage = model->addNewStage<ROIAlignStage>(layer->name, StageType::ROIAlign, layer, inputs, outputs);
+
+    stage->attrs().set<ROIAlignMode>(s_mode, mode);
+    stage->attrs().set<int>(s_pooled_w, layer->GetParamAsInt("pooled_w"));
+    stage->attrs().set<int>(s_pooled_h, layer->GetParamAsInt("pooled_h"));
+    stage->attrs().set<int>(s_sampling_ratio, layer->GetParamAsInt("sampling_ratio"));
+    stage->attrs().set<float>(s_spatial_scale, layer->GetParamAsFloat("spatial_scale"));
+}
+
+} // namespace vpu
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+#include <string>
+
+namespace vpu {
+
+using InferenceEngine::CNNLayerPtr;
+
+//----------------------------------------------------------------------
+
+namespace {
+
+// Stage implementing ScatterUpdate: the output starts as a copy of `data`,
+// then slices of `updates` are written at the positions given by `indices`
+// along the axis read from the 1-element `axis` tensor.
+class ScatterUpdateStage final : public StageNode {
+public:
+    using StageNode::StageNode;
+
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ScatterUpdateStage>(*this);
+    }
+
+    // Keep every tensor in the natural order for its rank; no permutations.
+    void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+        const auto data = inputEdge(0)->input();
+        const auto indices = inputEdge(1)->input();
+        const auto updates = inputEdge(2)->input();
+        const auto axis = inputEdge(3)->input();
+        const auto output = outputEdge(0)->output();
+        orderInfo.setInput(inputEdge(0), DimsOrder::fromNumDims(data->desc().numDims()));
+        orderInfo.setInput(inputEdge(1), DimsOrder::fromNumDims(indices->desc().numDims()));
+        orderInfo.setInput(inputEdge(2), DimsOrder::fromNumDims(updates->desc().numDims()));
+        orderInfo.setInput(inputEdge(3), DimsOrder::fromNumDims(axis->desc().numDims()));
+        orderInfo.setOutput(outputEdge(0), DimsOrder::fromNumDims(output->desc().numDims()));
+    }
+
+    // Only the bulk tensors need compact strides; `indices` and `axis` are
+    // deliberately left unconstrained here.
+    void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+        stridesInfo.setInput(inputEdge(0), StridesRequirement::compact()); // `data` tensor
+        stridesInfo.setInput(inputEdge(2), StridesRequirement::compact()); // `updates` tensor
+        stridesInfo.setOutput(outputEdge(0), StridesRequirement::compact()); // `output` tensor
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& /*batchInfo*/) override {
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::NotNeeded;
+    }
+
+    void initialCheckImpl() const override {
+        assertInputsOutputsTypes(this,
+            // `data` , `indices` , `updates` , `axis` tensor
+            {{DataType::FP16}, {DataType::S32}, {DataType::FP16}, {DataType::S32}},
+            {{DataType::FP16}});
+    }
+
+    // NOTE: serialization order is data, output, indices, updates, axis —
+    // the output is second, not last; presumably this matches the device
+    // kernel's argument layout, so keep it in sync.
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        auto data = input(0);
+        auto indices = input(1);
+        auto updates = input(2);
+        auto axis = input(3);
+        auto out = output(0);
+
+        data->serializeBuffer(serializer);
+        out->serializeBuffer(serializer);
+        indices->serializeBuffer(serializer);
+        updates->serializeBuffer(serializer);
+        axis->serializeBuffer(serializer);
+    }
+
+    // No scalar parameters: the axis is passed as a tensor, not an attr.
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+    }
+};
+
+} // namespace
+
+//----------------------------------------------------------------------
+
+// Validates element types, ranks, layouts, and sizes of the five tensors
+// involved in ScatterUpdate; throws via VPU_THROW_UNLESS on the first
+// violated invariant.
+static
+void checkTensorShapes(const vpu::Data& input,
+                       const vpu::Data& output,
+                       const vpu::Data& indices,
+                       const vpu::Data& updates,
+                       const vpu::Data& axis) {
+    const DataDesc& inputDesc = input->desc();
+    const DataDesc& outputDesc = output->desc();
+    const DataDesc& indicesDesc = indices->desc();
+    const DataDesc& updatesDesc = updates->desc();
+    const DataDesc& axisDesc = axis->desc();
+
+    const auto inputType = inputDesc.type();
+    const auto outputType = outputDesc.type();
+    const auto indicesType = indicesDesc.type();
+    const auto updatesType = updatesDesc.type();
+    const auto axisType = axisDesc.type();
+
+    VPU_THROW_UNLESS(inputType == DataType::FP16, "input type is invalid");
+    VPU_THROW_UNLESS(outputType == DataType::FP16, "output type is invalid");
+    VPU_THROW_UNLESS(indicesType == DataType::S32, "indices type is invalid");
+    VPU_THROW_UNLESS(updatesType == DataType::FP16, "updates type is invalid");
+    VPU_THROW_UNLESS(axisType == DataType::S32, "axis type is invalid");
+
+    const int inputNDims = inputDesc.numDims();
+    const int outputNDims = outputDesc.numDims();
+    const int indicesNDims = indicesDesc.numDims();
+    const int updatesNDims = updatesDesc.numDims();
+    const int axisNDims = axisDesc.numDims();
+
+    VPU_THROW_UNLESS(inputNDims > 0, "input tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(outputNDims > 0, "output tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(indicesNDims > 0, "indices tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(updatesNDims > 0, "updates tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(axisNDims > 0, "axis tensor must not be 0-dimensional");
+
+    VPU_THROW_UNLESS(inputNDims == outputNDims,
+                     "input and output have different shapes: inputNDims={}, outputNDims={}",
+                     inputNDims, outputNDims);
+
+    // BUGFIX: the first placeholder was printf-style `%d`, which
+    // VPU_THROW_UNLESS's `{}`-based formatter would print literally.
+    VPU_THROW_UNLESS(updatesNDims == indicesNDims + outputNDims - 1,
+                     "incompatible shapes: indicesNDims={}, updatesNDims={}, outputNDims={}",
+                     indicesNDims, updatesNDims, outputNDims);
+
+    VPU_THROW_UNLESS(axisNDims == 1,
+                     "axis tensor must be 1-dimensional, but axisNDims={}",
+                     axisNDims);
+
+    const DimsOrder inputDimsOrder = inputDesc.dimsOrder();
+    const DimsOrder outputDimsOrder = outputDesc.dimsOrder();
+    const DimsOrder indicesDimsOrder = indicesDesc.dimsOrder();
+    const DimsOrder updatesDimsOrder = updatesDesc.dimsOrder();
+    const DimsOrder axisDimsOrder = axisDesc.dimsOrder();
+
+    VPU_THROW_UNLESS(inputDimsOrder == outputDimsOrder, "input/output must have same layout"
+                     ", but inputDimsOrder = \"{}\", and outputDimsOrder = \"{}\"",
+                     inputDimsOrder, outputDimsOrder);
+
+    // Check if tensor shapes fit each other, e.g.:
+    // {N, C, H, W} could be shape of `input` and `output`
+    // {I, J, C, H, W} could be shape of `update` tensor
+    // {I, J} could be shape of `indices`
+
+    const DimValues& inputDims = inputDesc.dims();
+    const DimValues& outputDims = outputDesc.dims();
+    const DimValues& indicesDims = indicesDesc.dims();
+    const DimValues& updatesDims = updatesDesc.dims();
+    const DimValues& axisDims = axisDesc.dims();
+
+    VPU_THROW_UNLESS(inputDims == outputDims, "input/output tensors must have same lengths"
+                     ", but inputDims = \"{}\", and outputDims = \"{}\"", inputDims, outputDims);
+
+    // Permutation is array of dims, from minor to major
+    const DimVector inputPerm = inputDimsOrder.toPermutation();
+    const DimVector indicesPerm = indicesDimsOrder.toPermutation();
+    const DimVector updatesPerm = updatesDimsOrder.toPermutation();
+
+    // Check if the updates fits the input, e.g.:
+    // {N, C, H, W} could be shape of `input` and `output`
+    // {I, J, C, H, W} could be shape of `update` tensor
+    for (int i = 0; i < inputNDims - 1; i++) {
+        const Dim inputDim = inputPerm[i];
+        const Dim updatesDim = updatesPerm[i];
+        const int inputSize = inputDims[inputDim];
+        const int updatesSize = updatesDims[updatesDim];
+        VPU_THROW_UNLESS(inputSize == updatesSize,
+                         "updates size must fit input along corresponding axes, "
+                         "but for axis={}: input size={}, updates size={}",
+                         i, inputSize, updatesSize);
+    }
+
+    // Check if the updates fits the indices, e.g.:
+    // {I, J, C, H, W} could be shape of `update` tensor
+    // {I, J} could be shape of `indices`
+    for (int i = inputNDims - 1; i < updatesNDims; i++) {
+        const int i0 = i - (inputNDims - 1);
+        const Dim indicesDim = indicesPerm[i0];
+        const Dim updatesDim = updatesPerm[i];
+        const int indicesSize = indicesDims[indicesDim];
+        const int updatesSize = updatesDims[updatesDim];
+        VPU_THROW_UNLESS(indicesSize == updatesSize,
+                         "updates size must fit indices along corresponding axes, "
+                         "but for axis={}: indices size={}, updates size={}",
+                         i, indicesSize, updatesSize);
+    }
+
+    // Note, that for a 1D tensor the layout is "C"
+    VPU_THROW_UNLESS(axisDimsOrder == DimsOrder::C,
+                     "axis must be 1D tensor, but its dims order is {}",
+                     axisDimsOrder);
+    // BUGFIX: placeholder was printf-style `%d` — replaced with `{}` so the
+    // actual axis length is substituted into the message.
+    VPU_THROW_UNLESS(axisDims[Dim::C] == 1,
+                     "axis tensor must be 1D array of 1 element, but axis length = {}",
+                     axisDims[Dim::C]);
+}
+
+// Parse an IE ScatterUpdate layer into a ScatterUpdateStage.
+// Expects exactly 4 inputs (`data`, `indices`, `updates`, `axis`) and 1 output.
+void FrontEnd::parseScatterUpdate(const Model& model,
+                                  const CNNLayerPtr& layer,
+                                  const DataVector& inputs,
+                                  const DataVector& outputs) const {
+    // BUGFIX: VPU_THROW_UNLESS uses `{}` placeholders, so the former
+    // printf-style `%lu`/`%s` specifiers were printed literally.
+    VPU_THROW_UNLESS(inputs.size() == 4, "invalid number of inputs: {}", inputs.size());
+    VPU_THROW_UNLESS(outputs.size() == 1, "invalid number of outputs: {}", outputs.size());
+
+    const auto& input = inputs[0]; // `data` tensor
+    const auto& indices = inputs[1];
+    const auto& updates = inputs[2];
+    const auto& axis = inputs[3];
+    const auto& output = outputs[0];
+
+    // Full type/rank/shape validation is shared with the StageBuilder path.
+    checkTensorShapes(input, output, indices, updates, axis);
+
+    auto scatterUpdateLayer = std::dynamic_pointer_cast<ie::ScatterUpdateLayer>(layer);
+
+    VPU_THROW_UNLESS(scatterUpdateLayer != nullptr,
+                     "this layer is not an instance of ScatterUpdateLayer: "
+                     "layer name = \"{}\", layer type = \"{}\"",
+                     layer->name, layer->type);
+
+    auto stage = model->addNewStage<ScatterUpdateStage>(layer->name,
+                                                        StageType::ScatterUpdate,
+                                                        layer,
+                                                        {input, indices, updates, axis},
+                                                        {output});
+
+    VPU_THROW_UNLESS(stage != nullptr,
+                     "failed to create ScatterUpdateStage: "
+                     "layer name = \"{}\", layer type = \"{}\"",
+                     layer->name, layer->type);
+}
+
+//----------------------------------------------------------------------
+
+// StageBuilder counterpart of parseScatterUpdate: validates the tensors and
+// adds a ScatterUpdateStage wired to them. `layer` may describe the origin
+// layer for diagnostics.
+Stage StageBuilder::addScatterUpdateStage(
+        const Model& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output,
+        const Data& indices,
+        const Data& updates,
+        const Data& axis) {
+    checkTensorShapes(input, output, indices, updates, axis);
+
+    auto stage = model->addNewStage<ScatterUpdateStage>(name,
+                                                        StageType::ScatterUpdate,
+                                                        layer,
+                                                        {input, indices, updates, axis},
+                                                        {output});
+
+    // BUGFIX: placeholders were printf-style `%s`, which the `{}`-based
+    // formatter of VPU_THROW_UNLESS would emit literally.
+    VPU_THROW_UNLESS(stage != nullptr,
+                     "failed to create ScatterUpdateStage: "
+                     "layer name = \"{}\", layer type = \"{}\"",
+                     layer->name, layer->type);
+
+    return stage;
+}
+
+} // namespace vpu
auto inputBiases = inputEdge(2)->input();
auto output = outputEdge(0)->output();
- input->serializeBuffer(serializer, output->desc().dimsOrder());
+ input->serializeBuffer(serializer);
output->serializeBuffer(serializer);
- inputScales->serializeBuffer(serializer, output->desc().dimsOrder());
- inputBiases->serializeBuffer(serializer, output->desc().dimsOrder());
+ inputScales->serializeBuffer(serializer);
+ inputBiases->serializeBuffer(serializer);
}
};
#include "vpu/utils/auto_scope.hpp"
#include "vpu/compile_env.hpp"
#include "graph_transformer.h"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
#include "ie_layers_internal.hpp"
#include "net_pass.h"
VPU_THROW_UNLESS(isConst(original), "VPU const data object can be created only from const IE data object");
const auto& creator = original->getCreatorLayer().lock();
- const auto& blob = ieBlobContent(creator->blobs.begin()->second);
const auto& descriptor = createDescriptor(original->getTensorDesc());
+ const auto& blob = ieBlobContent(creator->blobs.begin()->second, descriptor.type());
return model->addConstData(original->getName(), descriptor, blob);
};
# "mvnc" must be the first library in the link list
target_link_libraries(${TARGET_NAME}
PRIVATE
- mvnc ${INTEL_ITT_LIBS} ${NGRAPH_LIBRARIES} inference_engine vpu_graph_transformer)
+ mvnc ${INTEL_ITT_LIBS} inference_engine vpu_graph_transformer)
# install
void ExecutableNetwork::Import(std::istream& strm,
std::vector<DevicePtr> &devicePool,
const MyriadConfig& config) {
- std::ostringstream blobContentStream;
- blobContentStream << strm.rdbuf();
- const std::string& blobContentString = blobContentStream.str();
- std::copy(blobContentString.begin(), blobContentString.end(), std::back_inserter(_graphBlob));
+ auto currentPos = strm.tellg();
+ strm.seekg(0, strm.end);
+ auto blobSize = strm.tellg() - currentPos;
+ _graphBlob.resize(static_cast<size_t>(blobSize));
+ strm.seekg(currentPos, strm.beg);
+ strm.read(&_graphBlob[0], blobSize);
if (!_device->isBooted()) {
return;
#include <utility>
#include <ie_metric_helpers.hpp>
-#include <cnn_network_ngraph_impl.hpp>
+#include <cpp/ie_cnn_network.h>
#include <cpp_interfaces/base/ie_plugin_base.hpp>
#include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
#include <vpu/parsed_config.hpp>
#include <vpu/utils/profiling.hpp>
#include <vpu/utils/error.hpp>
+#include <vpu/ngraph/transformations/dynamic_to_static_shape.hpp>
+#include <generic_ie.hpp>
#include "myriad_plugin.h"
auto parsedConfigCopy = _parsedConfig;
parsedConfigCopy.update(config);
- std::shared_ptr<ICNNNetwork> clonedNetwork(nullptr);
-
- if (auto networkNGraph = dynamic_cast<const CNNNetworkNGraphImpl*>(&network)) {
- clonedNetwork = networkNGraph->cloneNGraphImpl();
- } else {
- clonedNetwork = cloneNet(network);
+ std::shared_ptr<ICNNNetwork> clonedNetwork = cloneNetwork(network);
+ if (auto func = clonedNetwork->getFunction()) {
+ ngraph::op::GenericIE::DisableReshape noReshape(func);
+ ngraph::pass::DynamicToStaticShape().run_on_function(func);
}
return std::make_shared<ExecutableNetwork>(*clonedNetwork, _devicePool, parsedConfigCopy);
auto parsedConfigCopy = _parsedConfig;
parsedConfigCopy.update(config);
+ const auto deviceName = parsedConfigCopy.deviceName();
+ if (!deviceName.empty()) {
+ const auto deviceIDs = GetMetric(METRIC_KEY(AVAILABLE_DEVICES), {}).as<std::vector<std::string>>();
+ VPU_THROW_UNLESS(!(std::find(deviceIDs.begin(), deviceIDs.end(), deviceName) == deviceIDs.end()), "Myriad device: {} not found.", deviceName);
+ }
+
const auto log = std::make_shared<Logger>(
"GraphCompiler",
parsedConfigCopy.logLevel(),
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#include <extension.hpp>
#include <ngraph/opsets/opset.hpp>
#include <ngraph/factory.hpp>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <cpp/ie_cnn_network.h>
-#include <cnn_network_ngraph_impl.hpp>
#include <string>
#include <sstream>
#include <fstream>
auto ngraph_function = std::make_shared<ngraph::Function>(ngraph::ResultVector{output},
ngraph::ParameterVector{inp});
- InferenceEngine::details::CNNNetworkNGraphImpl cnn(ngraph_function);
- auto icnn = cnn.getCNNNetwork();
+ CNNNetwork cnn(ngraph_function);
+ cnn.begin();
std::map<std::string, InferenceEngine::SizeVector> inShape;
inShape["test"] = {1, 3, 4, 5};
- icnn->reshape(inShape, nullptr);
+ cnn.reshape(inShape);
}
TEST_F(NGraphReshapeTests, genericNodeWithDynShape) {
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Verifies that a mixed FP32/BF16 network is restored with the expected
+// per-layer execution precisions (see the diagram below and the
+// `expectedPrecisions` map filled in SetUp()).
+class BF16NetworkRestore1 : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        //             + Power1(FP32)
+        //               |
+        //             + AvgPooling1(FP32)
+        //               |
+        //             + Convolution1(BF16)
+        //               |
+        //             + ReLU1(Fused)
+        //               |------------------------
+        //               |                        \
+        //             + Convolution2(BF16)    Convolution 3 (BF16)
+        //               |                      /           \
+        //             + |               ReLU2(FP32)    Normalize (FP32)
+        //                \            /                     |
+        //           Eltwise (Fused to Conv2)   ------------/
+        //                    |                /
+        //            ReLU3 (Fused to Conv2)  /
+        //                    |              /
+        //             MaxPooling1 (FP32)   /
+        //                     \           /
+        //                      Eltwise
+        //                         |
+
+
+        // STAGE1: construction of the GRAPH
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 224, 224});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Power1");
+
+        // AvgPooling
+        auto avgpoolNode = std::make_shared<opset1::AvgPool>(addNode,
+                                                             Strides{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{2, 2},
+                                                             true,
+                                                             op::RoundingType::FLOOR);
+        avgpoolNode->set_friendly_name("AvgPooling1");
+
+        // convolution1
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channel, /input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            avgpoolNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("Convolution1");
+
+        // ReLU1
+        auto reluNode = std::make_shared<opset1::Relu>(convNode1);
+        reluNode->set_friendly_name("ReLU1");
+
+        // convolution2
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            reluNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("Convolution2");
+
+        // convolution3
+        std::shared_ptr<ngraph::Node> convNode3 = std::make_shared<ngraph::opset1::Convolution>(
+            reluNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode3->set_friendly_name("Convolution3");
+
+        // ReLU2 (was mislabeled "ReLU1" — this node is named "ReLU2")
+        auto reluNode2 = std::make_shared<opset1::Relu>(convNode3);
+        reluNode2->set_friendly_name("ReLU2");
+
+        // Norm1
+        // normalize
+        const auto axes = make_shared<op::Constant>(element::i64, Shape{2}, vector<int64_t>{2});
+        float eps{1e-6f};
+        auto eps_mode = op::EpsMode::ADD;
+
+        auto normNode = std::make_shared<opset1::NormalizeL2>(convNode3, axes, eps, eps_mode);
+        normNode->set_friendly_name("Norm1");
+
+
+
+        // Eltwise1
+        auto eltNode1 = std::make_shared<opset1::Add>(convNode2, reluNode2);
+        eltNode1->set_friendly_name("Eltwise1");
+
+        // ReLU3
+        auto reluNode3 = std::make_shared<opset1::Relu>(eltNode1);
+        reluNode3->set_friendly_name("ReLU3");
+
+        // maxPooling1
+        auto maxPoolNode = std::make_shared<opset1::MaxPool>(reluNode3,
+                                                             Strides{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{0, 0},
+                                                             Shape{2, 2},
+                                                             op::RoundingType::FLOOR);
+        maxPoolNode->set_friendly_name("maxPooling1");
+
+        // Eltwise2
+        auto eltNode2 = std::make_shared<opset1::Add>(maxPoolNode, normNode);
+        eltNode2->set_friendly_name("Eltwise2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        threshold = 0.4f;  // max value in the latest tensor for FP32 network is 10.83
+
+        // STAGE2:
+        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["Power1"] = "FP32";
+        expectedPrecisions["AvgPooling1"] = "FP32";
+        expectedPrecisions["Convolution1"] = "BF16";
+        expectedPrecisions["ReLU1"] = "ndef";
+        expectedPrecisions["Convolution2"] = "BF16";
+        expectedPrecisions["Convolution3"] = "BF16";
+        expectedPrecisions["ReLU2"] = "FP32";
+        expectedPrecisions["Norm1"] = "FP32";
+        expectedPrecisions["Eltwise1"] = "ndef";
+        expectedPrecisions["ReLU3"] = "ndef";
+        expectedPrecisions["maxPooling1"] = "FP32";
+        expectedPrecisions["Eltwise2"] = "FP32";
+    }
+};
+
+TEST_P(BF16NetworkRestore1, CompareWithRefImpl) {
+ test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, BF16NetworkRestore1,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 224, 224 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ BF16NetworkRestore1::getTestCaseName);
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <inference_engine.hpp>
+#include "ie_common.h"
+#include <ie_blob.h>
+#include <math.h>
+#include <map>
+#include <string>
+#include <utility>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "ngraph/opsets/opset1.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include <ie_system_conf.h>
+
+namespace LayerTestsDefinitions {
+
+/**
+ * Static helpers for bfloat16 functional tests.
+ * Use these functions to fill tensor contents by a periodic law and to compare
+ * the precisions reported in performance counters against expectations.
+ */
+class BFloat16Helpers {
+public:
+    // Fill a float buffer with sin(i), i = 0..size-1.
+    static void fillInputsBySinValues(float* data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = sin(static_cast<float>(i));
+        }
+    }
+
+    // Fill a raw bf16 buffer (stored as shorts) with sin(i) reduced to bf16.
+    static void fillInputsBySinValues(short *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = reducePrecisionBitwiseS(sin(static_cast<float>(i)));
+        }
+    }
+
+    // Fill a float buffer with cos(i), i = 0..size-1.
+    static void fillInputsByCosValues(float* data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = cos(static_cast<float>(i));
+        }
+    }
+
+    // Returns 0 on success, -1 if blob is not a MemoryBlob, -2 if not FP32.
+    static int fillInputsBySinValues(InferenceEngine::Blob::Ptr blob) {
+        InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
+        if (!mblob) {
+            return -1;
+        }
+        if (mblob->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32) {
+            return -2;
+        }
+        auto lm = mblob->rwmap();
+        fillInputsBySinValues(lm.as<float*>(), mblob->size());
+        return 0;
+    }
+
+    // Compares the executed precision (suffix of `exec_type` from the perf
+    // counters) of each expected layer with the expectation.
+    // Returns ("", "") on full match; otherwise the offending layer name and
+    // the precision (or marker) actually found.
+    static std::pair<std::string, std::string> matchPerfCountPrecisionVsExpected(
+        const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>& perfCounts,
+        const std::map<std::string, std::string>& expected) {
+        for (auto e : expected) {
+            auto it = perfCounts.find(e.first);
+            if (it == perfCounts.end()) {
+                return std::pair<std::string, std::string>(e.first, "NOT_FOUND_IN_PERF_COUNTS");
+            }
+            // get the latest n symbols by number of e.second
+            std::string execType = it->second.exec_type;
+            if (execType.length() < e.second.length()) {
+                // ROBUSTNESS: previously `length() - e.second.length()` underflowed
+                // (size_t) and substr() threw std::out_of_range for short exec_types.
+                return std::pair<std::string, std::string>(e.first, execType);
+            }
+            std::string pfPrecision = execType.substr(execType.length() - e.second.length(), e.second.length());
+            if (pfPrecision != e.second) {
+                return std::pair<std::string, std::string>(e.first, pfPrecision);
+            }
+        }
+        return std::pair<std::string, std::string>("", "");
+    }
+
+    // Returns max |data[i]| over the buffer.
+    static float getMaxAbsValue(const float* data, size_t size) {
+        float maxVal = 0.f;
+        for (size_t i = 0; i < size; i++) {
+            // BUGFIX: was `fabs(data[i] > maxVal)` — fabs of a *bool* — which
+            // silently ignored the magnitude of negative values.
+            if (fabs(data[i]) > maxVal) {
+                maxVal = fabs(data[i]);
+            }
+        }
+        return maxVal;
+    }
+
+    // Zeroes the low 16 bits of an FP32 value (with round-up on the dropped
+    // half when the exponent field allows), emulating bf16 precision while
+    // keeping the value an FP32.
+    static float reducePrecisionBitwise(const float in) {
+        float f = in;
+        // NOTE(review): reinterpret_cast type punning is technically UB in C++;
+        // kept as-is to preserve the original bit manipulation exactly.
+        int* i = reinterpret_cast<int*>(&f);
+        int t2 = *i & 0xFFFF0000;
+        float ft1 = *(reinterpret_cast<float*>(&t2));
+        if ((*i & 0x8000) && (*i & 0x007F0000) != 0x007F0000) {
+            t2 += 0x10000;
+            ft1 = *(reinterpret_cast<float*>(&t2));
+        }
+        return ft1;
+    }
+
+    // Same reduction, but returns the upper 16 bits as the raw bf16 pattern.
+    static short reducePrecisionBitwiseS(const float in) {
+        float f = reducePrecisionBitwise(in);
+        int intf = *reinterpret_cast<int*>(&f);
+        intf = intf >> 16;
+        short s = intf;
+        return s;
+    }
+};
+
+
+typedef std::tuple<
+ InferenceEngine::Precision,
+ InferenceEngine::Precision,
+ InferenceEngine::SizeVector,
+ InferenceEngine::SizeVector,
+ std::string> basicParams;
+
+
+/**
+ * Base class for bf16 tests.
+ * The flow in these tests is to load the network in FP32 and in BF16 modes and verify
+ * 1. the difference between output tensors against some threshold;
+ * 2. which precision was selected for the layers described in the runtime info of the performance counters.
+ *
+ * To develop a new test you need to
+ * 1. define a class inherited from BasicBF16Test and implement SetUp(). For example:
+ *
+ * class ScaleshiftConv_x3_Eltwise : public BasicBF16Test {
+ * protected:
+ * void SetUp()override {
+ * fnPtr = std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode3}, ngraph::ParameterVector{input1});
+
+ // STAGE1:
+ threshold = 9e-1;
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_4"] = "FP32";
+ expectedPrecisions["Convolution_6"] = "BF16";
+ expectedPrecisions["Convolution_7"] = "BF16";
+ expectedPrecisions["Add_8"] = "ndef";
+ * expectedPrecisions["Convolution_10"] = "BF16";
+ * }
+ * };
+ *
+ * 2. define test
+ * TEST_P(ScaleshiftConv_x3_Eltwise, CompareWithRefImpl) {
+ test();
+};
+ * 3. INSTANTIATE_TEST_CASE_P(bfloat16_NoReshape, ScaleshiftConv_x3_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x3_Eltwise::getTestCaseName);
+
+ *
+ * In the 3rd stage do not forget the bfloat16 prefix!
+ */
+// Common fixture for CPU bfloat16 tests.  A derived test implements createGraph()
+// to build its topology and fills `expectedPrecisions` in SetUp().  test() then
+// runs the same input through two networks -- one with BF16 enforced and one in
+// pure FP32 -- compares the raw output buffers within `threshold` (Stage 1), and
+// verifies per-layer execution precisions from performance counters (Stage 2).
+class BasicBF16Test : public LayerTestsUtils::LayerTestsCommonClass<basicParams> {
+protected:
+ // Builds the ngraph function under test for the given network precision.
+ virtual std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) = 0;
+
+public:
+ std::shared_ptr<ngraph::Function> fnPtr; // graph created by the derived SetUp()
+ std::vector<float *> refOut;
+ InferenceEngine::SizeVector inputShapes, newInputShapes;
+ InferenceEngine::SizeVector refOutShape;
+ // layer friendly name -> expected execution precision ("FP32", "BF16", or "ndef" for fused layers)
+ std::map<std::string, std::string> expectedPrecisions;
+ float threshold = 2e-2; // Is enough for tensor having abs maximum values less than 1
+
+ // Builds a human-readable test name out of the parameter tuple.
+ static std::string getTestCaseName(testing::TestParamInfo<basicParams> obj) {
+ InferenceEngine::Precision inputPrecision, netPrecision;
+ InferenceEngine::SizeVector inputShapes, newInputShapes;
+ std::string targetDevice;
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = obj.param;
+
+ std::ostringstream result;
+ if (!newInputShapes.empty()) {
+ result << "Reshape_From=" << CommonTestUtils::vec2str(inputShapes);;
+ result << "_To=" << CommonTestUtils::vec2str(newInputShapes) << "_";
+ } else {
+ result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+ }
+ result << "inPRC=" << inputPrecision.name() << "_";
+ result << "netPRC=" << netPrecision.name() << "_";
+ result << "targetDevice=" << targetDevice;
+ return result.str();
+ }
+
+ void test() {
+ if (!InferenceEngine::with_cpu_x86_bfloat16()) {
+ // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
+ // tests are useless on such platforms
+ return;
+ }
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ InferenceEngine::CNNNetwork cnnNet(fnPtr);
+
+ setNetInOutPrecision(cnnNet, inputPrecision);
+ std::string inputName = cnnNet.getInputsInfo().begin()->first;
+ std::string outputName = cnnNet.getOutputsInfo().begin()->first;
+ auto ie = InferenceEngine::Core();
+ // BF16 inference
+ // BF16 is enforced by the plugin only when the graph itself is FP32; a graph
+ // built as BF16 already carries bf16 constants, so enforcement is switched off.
+ std::map<std::string, std::string> options;
+ if (netPrecision == InferenceEngine::Precision::FP32) {
+ options[InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16] = InferenceEngine::PluginConfigParams::YES;
+ } else {
+ options[InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16] = InferenceEngine::PluginConfigParams::NO;
+ }
+ // performance counters are required for the Stage 2 precision verification below
+ options[InferenceEngine::PluginConfigParams::KEY_PERF_COUNT] = InferenceEngine::PluginConfigParams::YES;
+ options[InferenceEngine::PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT] = "egraph_test";
+
+ auto exec_net1 = ie.LoadNetwork(cnnNet, targetDevice, options);
+ auto req1 = exec_net1.CreateInferRequest();
+
+ InferenceEngine::Blob::Ptr inBlob1 = req1.GetBlob(inputName);
+ BFloat16Helpers::fillInputsBySinValues(inBlob1);
+
+ req1.Infer();
+ auto outBlobBF16 = req1.GetBlob(outputName);
+ InferenceEngine::MemoryBlob::CPtr mout1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(outBlobBF16);
+ ASSERT_NE(mout1, nullptr);
+ auto lm1 = mout1->rmap();
+
+ // FP32 inference (reference run)
+ // if netPrecision is not eq to the FP32 - change network precision and recreate network
+ InferenceEngine::CNNNetwork cnnNetFP32(createGraph(InferenceEngine::Precision::FP32));
+ std::string inputNameFP32 = cnnNetFP32.getInputsInfo().begin()->first;
+ std::string outputNameFP32 = cnnNetFP32.getOutputsInfo().begin()->first;
+ setNetInOutPrecision(cnnNetFP32, inputPrecision);
+ auto exec_net2 = ie.LoadNetwork(cnnNetFP32, targetDevice,
+ { { InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16, InferenceEngine::PluginConfigParams::NO } });
+ auto req2 = exec_net2.CreateInferRequest();
+
+
+ // reuse the very same input blob so both runs see identical data
+ req2.SetBlob(inputNameFP32, inBlob1);
+
+ req2.Infer();
+ auto outBlobFP32 = req2.GetBlob(outputNameFP32);
+ InferenceEngine::MemoryBlob::CPtr mout2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(outBlobFP32);
+ ASSERT_NE(mout2, nullptr);
+ auto lm2 = mout2->rmap();
+
+ // debug to figure out the maximum value in output tensors:
+ // std::cout << "Max in bfloat16 network by output " << outputName << ": " <<
+ // BFloat16Helpers::getMaxAbsValue(lm1.as<const float *>(), mout1->size()) << std::endl;
+ // std::cout << "Max in fp32 network by output " << outputNameFP32 << ": " <<
+ // BFloat16Helpers::getMaxAbsValue(lm2.as<const float *>(), mout2->size()) << std::endl;
+
+ // Stage1: element-wise comparison of BF16 vs FP32 outputs within `threshold`
+ FuncTestUtils::compareRawBuffers(lm1.as<const float *>(),
+ lm2.as<const float *>(),
+ mout1->size(), mout2->size(),
+ threshold);
+
+ // Stage2: verification of performance counters
+ std::pair<std::string, std::string> wrongLayer =
+ BFloat16Helpers::matchPerfCountPrecisionVsExpected(req1.GetPerformanceCounts(), expectedPrecisions);
+ if (wrongLayer.first != std::string("")) {
+ // report the first mismatching layer together with actual vs expected precision
+ std::string layerInPerfCounts = wrongLayer.first + " " + wrongLayer.second;
+ std::string layerExpected = wrongLayer.first + " " + expectedPrecisions[wrongLayer.first];
+ ASSERT_EQ(layerInPerfCounts, layerExpected);
+ }
+ fnPtr.reset();
+ }
+};
+
+} // namespace LayerTestsDefinitions
+
+
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// ScaleShift -> Conv -> Conv chain; checks that both convolutions execute in
+// BF16 while the preceding eltwise scale-shift stays in FP32.
+class ConvConv : public BasicBF16Test {
+protected:
+ // NOTE(review): unlike sibling tests, this createGraph ignores `netPrecision`
+ // and always builds an f32 graph -- confirm whether a bf16-constant variant
+ // was intended for the BF16 instantiation.
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // ScaleShift (FP32)
+ // |
+ // Conv (BF16)
+ // |
+ // Conv (BF16)
+
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ngraph::element::f32, ngraph::Shape{1, 3, 40, 40});
+ auto const1 = opset1::Constant::create(ngraph::element::f32, Shape{1}, { 2.0f });
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ auto const2 = opset1::Constant::create(ngraph::element::f32, Shape{1}, { 1.0f });
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_1");
+
+ // convolution
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ std::vector<float> weightValues;
+ weightValues.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValues.data(), weightValues.size());
+ auto weightsNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, convFilterShape, weightValues);
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ // Convolution
+ ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ std::vector<float> weightValues2;
+ weightValues2.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValues2.data(), weightValues2.size());
+ auto weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, convFilterShape2, weightValues2);
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ convNode1, weightsNode2,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+ }
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ // the maximum values in the latest tensor for this test is 24.4. It would be safe to set threshold eq to 0.1
+ threshold = 0.3f;
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["ADD_1"] = "FP32";
+ expectedPrecisions["CONV_1"] = "BF16";
+ expectedPrecisions["CONV_2"] = "BF16";
+ }
+};
+
+// Runs the shared BasicBF16Test::test() flow on the ConvConv topology.
+TEST_P(ConvConv, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ConvConv,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ConvConv::getTestCaseName);
+
+// NOTE(review): despite its "BF16_" name this instantiation passes exactly the
+// same parameters as the "FP32_" one above (netPrecision is FP32, not BF16), so
+// the two tests are identical.  Sibling files use Precision::BF16 here -- confirm
+// the intent; ConvConv::createGraph currently ignores netPrecision either way.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ConvConv,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ConvConv::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <vector>
+#include <string>
+
+#include <ie_core.hpp>
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// ScaleShift -> Conv -> depthwise GroupConvolution -> ReLU chain; the ReLU is
+// expected to fuse into the depthwise convolution ("ndef" in perf counters).
+class ConvDWConvReLU : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift (FP32)
+ // |
+ // Conv (BF16)
+ // |
+ // Depthwise Conv (BF16, assuming explicit separate execution of kernel, not fused into prev convolution)
+ // |
+ // ReLU (Fused into DW convolution)
+
+
+ // constants are created in f32 or bf16 depending on the requested net precision
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_1");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ // bf16 weights are stored as raw 16-bit patterns, hence the vector<short>
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ // DW convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+ ngraph::Shape convFilterShape2 = { 3, 1, 1, 3, 3 }; // groups, out channels/group, input channels/group, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValues2FP32;
+ weightValues2FP32.resize(3 * 1 * 1 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValues2FP32.data(), weightValues2FP32.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2FP32);
+ } else {
+ std::vector<short> weightValues2BF16;
+ weightValues2BF16.resize(3 * 1 * 1 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::GroupConvolution>(
+ convNode1, weightsNode2,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ // ReLU
+ auto reluNode2 = std::make_shared<opset1::Relu>(convNode2);
+ reluNode2->set_friendly_name("RELU");
+
+ return std::make_shared<ngraph::Function>(reluNode2, ngraph::ParameterVector{input1});
+ }
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 0.4f; // maximum value in tensor is 54.89
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["ADD_1"] = "FP32";
+ expectedPrecisions["CONV_1"] = "BF16";
+ expectedPrecisions["CONV_2"] = "BF16";
+ expectedPrecisions["RELU"] = "ndef"; // fused into CONV_2, so it has no own precision
+ }
+};
+
+// Runs the shared BasicBF16Test::test() flow on the ConvDWConvReLU topology.
+TEST_P(ConvDWConvReLU, CompareWithRefImpl) {
+ test();
+};
+
+// FP32 network with BF16 enforced by the plugin (KEY_ENFORCE_BF16=YES path).
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ConvDWConvReLU,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ConvDWConvReLU::getTestCaseName);
+
+// Natively-BF16 network (graph built with bf16 constants; enforcement disabled).
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ConvDWConvReLU,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ConvDWConvReLU::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Conv->ReLU->AvgPool->Conv->ReLU->MaxPool->Conv chain; the first block is
+// expected to stay FP32 (network input), subsequent convolutions/pooling in BF16,
+// and both ReLUs fused ("ndef").
+class ConvReLUPoolConvReLUPool : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // Convolution1 (FP32)
+ // |
+ // ReLU1 (Fused)
+ // |
+ // Pooling1 (FP32)
+ // |
+ // Convolution2 (BF16)
+ // |
+ // ReLU2 (Fused)
+ // |
+ // Pooling2 (BF16)
+ // |
+ // Convolution3 (BF16)
+
+
+ // STAGE1: construction of the GRAPH
+
+ // constants are created in f32 or bf16 depending on the requested net precision
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+
+ // convolution1
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ // bf16 weights are stored as raw 16-bit patterns, hence the vector<short>
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+ input1, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode->set_friendly_name("Convolution_1");
+
+ // ReLU
+ auto reluNode = std::make_shared<opset1::Relu>(convNode);
+ reluNode->set_friendly_name("ReLU_1");
+
+ // Pooling
+ auto avgpoolNode = std::make_shared<opset1::AvgPool>(reluNode,
+ Strides{1, 1},
+ Shape{1, 1},
+ Shape{1, 1},
+ Shape{2, 2},
+ true,
+ op::RoundingType::FLOOR);
+ avgpoolNode->set_friendly_name("AvgPool_1");
+
+ // convolution2
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+ ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ avgpoolNode, weightsNode2,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("Convolution_2");
+
+ // ReLU
+ auto reluNode2 = std::make_shared<opset1::Relu>(convNode2);
+ reluNode2->set_friendly_name("ReLU_2");
+
+ // Pooling
+ auto maxpoolNode2 = std::make_shared<opset1::MaxPool>(reluNode2,
+ Strides{1, 1},
+ Shape{1, 1},
+ Shape{0, 0},
+ Shape{2, 2},
+ op::RoundingType::FLOOR);
+ maxpoolNode2->set_friendly_name("MaxPool_2");
+
+ // convolution3
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode3 = nullptr;
+ ngraph::Shape convFilterShape3 = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode3 = std::make_shared<ngraph::opset1::Convolution>(
+ maxpoolNode2, weightsNode3,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode3->set_friendly_name("Convolution_3");
+
+
+
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode3}, ngraph::ParameterVector{input1});
+ }
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ threshold = 0.2f; // max value in the latest tensor for FP32 network is 9.8
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Convolution_1"] = "FP32";
+ expectedPrecisions["ReLU_1"] = "ndef"; // fused into Convolution_1
+ expectedPrecisions["AvgPool_1"] = "FP32";
+ expectedPrecisions["Convolution_2"] = "BF16";
+ expectedPrecisions["ReLU_2"] = "ndef"; // fused into Convolution_2
+ expectedPrecisions["MaxPool_2"] = "BF16";
+ expectedPrecisions["Convolution_3"] = "BF16";
+ }
+};
+
+// Runs the shared BasicBF16Test::test() flow on the ConvReLUPoolConvReLUPool topology.
+TEST_P(ConvReLUPoolConvReLUPool, CompareWithRefImpl) {
+ test();
+};
+
+
+// FP32 network with BF16 enforced by the plugin (KEY_ENFORCE_BF16=YES path).
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ConvReLUPoolConvReLUPool,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ConvReLUPoolConvReLUPool::getTestCaseName);
+
+// Natively-BF16 network (graph built with bf16 constants; enforcement disabled).
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ConvReLUPoolConvReLUPool,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ConvReLUPoolConvReLUPool::getTestCaseName);
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// ScaleShift -> 1x1 Convolution -> ReLU; targets a Faster-RCNN-like 1x1 conv case.
+// NOTE(review): the class name says 100x5x1x1, but the input built here is
+// {10, 5, 1, 1} and the filter {5, 5, 1, 1} -- confirm the intended dimensions.
+class Faster100_5_1_1_Conv : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // Power (FP32)
+ // |
+ // Convolution (BF16)
+
+ // STAGE1: construction of the GRAPH
+ // multiply
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{10, 5, 1, 1});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("Add_4");
+
+ // problematic convolution (see class NOTE above: actual filter is 5x5x1x1)
+ // weights form a fixed sparse 0/1 pattern rather than random sin values
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 5, 5, 1, 1 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValues;
+ weightValues.resize(5 * 5 * 1 * 1, 0.f);
+ weightValues[0] = 1.0f;
+ weightValues[7] = 1.0f;
+ weightValues[11] = 1.0f;
+ weightValues[19] = 1.0f;
+ weightValues[23] = 1.0f;
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, convFilterShape, weightValues);
+ } else {
+ // same 0/1 pattern, stored as raw bf16 bit patterns
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(5 * 5 * 1 * 1, BFloat16Helpers::reducePrecisionBitwiseS(0.0f));
+ weightValuesBF16[0] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+ weightValuesBF16[7] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+ weightValuesBF16[11] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+ weightValuesBF16[19] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+ weightValuesBF16[23] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode->set_friendly_name("Convolution_6");
+
+
+ // ReLU (left unnamed; not checked in expectedPrecisions)
+ auto reluNode = std::make_shared<opset1::Relu>(convNode);
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{reluNode}, ngraph::ParameterVector{input1});
+ }
+
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_4"] = "FP32";
+ expectedPrecisions["Convolution_6"] = "BF16";
+ }
+};
+
+// Runs the shared BasicBF16Test::test() flow on the Faster100_5_1_1_Conv topology.
+TEST_P(Faster100_5_1_1_Conv, CompareWithRefImpl) {
+ test();
+};
+
+
+// FP32 network with BF16 enforced by the plugin (KEY_ENFORCE_BF16=YES path).
+// Renamed from plain "bfloat16_NoReshape" to carry the "FP32_" prefix, matching
+// the FP32_/BF16_ instantiation naming used by its sibling below and by every
+// other bfloat16 test file.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, Faster100_5_1_1_Conv,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 10, 5, 1, 1 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ Faster100_5_1_1_Conv::getTestCaseName);
+
+// Natively-BF16 network (graph built with bf16 constants; enforcement disabled).
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, Faster100_5_1_1_Conv,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 10, 5, 1, 1 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ Faster100_5_1_1_Conv::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <vector>
+#include <string>
+
+#include <ie_core.hpp>
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Branching MobileNet-SSD-like topology: after CONV_1 the graph splits into a
+// Conv->ReLU->DW-Conv->ReLU branch and a NormalizeL2 branch, re-joined by Concat.
+class MobileNet_ssd_with_branching : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift
+ // |
+ // Conv1 (FP32)
+ // | \
+ // Conv2 (FP32 so far while we have not greedy mode. This must be fixed. Such pattern should have Conv2 in BF16)
+ // | |
+ // relu(fused) |
+ // | Normalize (not LRN)
+ // Conv (DW)(BF16) |
+ // | |
+ // ReLU (Fused) |
+ // \ /
+ // Concat
+ // NOTE(review): the diagram above says Conv1 is FP32, but SetUp() expects
+ // CONV_1 to execute in BF16 -- confirm which is current.
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_1");
+
+ // Conv1
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ // bf16 weights are stored as raw 16-bit patterns, hence the vector<short>
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ // Conv2 (deliberately reuses the same weights constant as Conv1)
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ convNode1, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ // ReLU
+ auto reluNode = std::make_shared<opset1::Relu>(convNode2);
+ reluNode->set_friendly_name("RELU_2");
+
+ // DW convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+ ngraph::Shape convFilterShape2 = { 3, 1, 1, 3, 3 }; // groups, out channels/group, input channels/group, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValues2FP32;
+ weightValues2FP32.resize(3 * 1 * 1 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValues2FP32.data(), weightValues2FP32.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2FP32);
+ } else {
+ std::vector<short> weightValues2BF16;
+ weightValues2BF16.resize(3 * 1 * 1 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> dwConvNode = std::make_shared<ngraph::opset1::GroupConvolution>(
+ reluNode, weightsNode2,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ dwConvNode->set_friendly_name("DW_CONV");
+
+ // ReLU
+ auto reluNode2 = std::make_shared<opset1::Relu>(dwConvNode);
+ reluNode2->set_friendly_name("RELU_DW");
+
+ // normalize
+ // NOTE(review): the axes constant is declared as Shape{2} but initialized from a
+ // single-element vector {2} -- presumably broadcast to both positions; confirm
+ // the intended normalization axes.
+ const auto axes = make_shared<op::Constant>(element::i64, Shape{2}, vector<int64_t>{2});
+ float eps{1e-6f};
+ auto eps_mode = op::EpsMode::ADD;
+
+ auto normNode = std::make_shared<opset1::NormalizeL2>(convNode1, axes, eps, eps_mode);
+ normNode->set_friendly_name("NORM_1");
+
+ // Concat of the DW branch and the Normalize branch along the channel axis
+ ngraph::NodeVector concInputNodes = { reluNode2, normNode };
+ auto concNode = std::make_shared<opset1::Concat>(concInputNodes, 1);
+ concNode->set_friendly_name("CONC_1");
+
+ return std::make_shared<ngraph::Function>(concNode, ngraph::ParameterVector{input1});
+ }
+
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 0.8f; // max value in latest tensor is 87.67
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["ADD_1"] = "FP32";
+ expectedPrecisions["CONV_1"] = "BF16";
+ expectedPrecisions["CONV_2"] = "FP32";
+ expectedPrecisions["RELU_2"] = "ndef"; // fused into CONV_2
+ expectedPrecisions["DW_CONV"] = "BF16";
+ expectedPrecisions["RELU_DW"] = "ndef"; // fused into DW_CONV
+ expectedPrecisions["NORM_1"] = "FP32";
+ expectedPrecisions["CONC_1"] = "FP32";
+ }
+};
+
+// Runs the shared BF16 comparison flow from BasicBF16Test: infer the graph,
+// compare accuracy against `threshold`, and verify per-layer execution
+// precisions against `expectedPrecisions` via performance counters.
+TEST_P(MobileNet_ssd_with_branching, CompareWithRefImpl) {
+    test();
+};
+
+// FP32 network precision: bfloat16 execution is enforced by plugin configuration.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, MobileNet_ssd_with_branching,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        MobileNet_ssd_with_branching::getTestCaseName);
+
+// BF16 network precision: graph constants are created directly as bfloat16.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, MobileNet_ssd_with_branching,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        MobileNet_ssd_with_branching::getTestCaseName);
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Verifies bfloat16 handling of a pattern where an eltwise Add joins a
+// ScaleShift (Multiply+Add) branch with a Convolution branch, and the Add is
+// expected to be fused into the convolution that consumes it.
+class ScaleshiftConvEltwiseConv : public BasicBF16Test {
+protected:
+    // Builds the test graph below. Convolution weights are filled with sine
+    // values; in the BF16 case constants are pre-rounded to bfloat16 bit
+    // patterns so both runs start from comparable values.
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        // scaleshift (FP32)   Conv (FP32)
+        //           \          /
+        //         Eltwise (Fused into Conv)
+        //              |
+        //          Conv (BF16)
+
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(addNode, convNode1);
+        eltNode->set_friendly_name("ELT_1");
+
+        // Convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues2;
+            weightValues2.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2.data(), weightValues2.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2);
+        } else {
+            std::vector<short> weightValues2BF16;
+            weightValues2BF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            eltNode, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+
+    // Sets accuracy threshold and the per-layer precisions the test expects to
+    // observe in performance counters ("ndef" marks a layer fused away).
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 0.2f;  // max value in the latest tensor for FP32 network is 37.77
+        // STAGE2:
+        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+// Runs the shared BF16 comparison flow (accuracy + per-layer precision checks).
+TEST_P(ScaleshiftConvEltwiseConv, CompareWithRefImpl) {
+    test();
+};
+
+// FP32 network precision: bfloat16 execution is enforced by plugin configuration.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEltwiseConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseConv::getTestCaseName);
+
+// BF16 network precision: graph constants are created directly as bfloat16.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEltwiseConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseConv::getTestCaseName);
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Same pattern as ScaleshiftConvEltwiseConv but with a ReLU between the
+// eltwise Add and the final convolution; both Add and ReLU are expected to be
+// fused into the convolution.
+class ScaleshiftConvEltwiseReluConv : public BasicBF16Test {
+protected:
+    // Builds the test graph below. Weights are sine-filled; BF16 constants are
+    // pre-rounded to bfloat16 bit patterns.
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        // scaleshift (FP32)   Conv (FP32)
+        //           \          /
+        //         Eltwise (Fused into conv)
+        //              |
+        //           ReLU (Fused into conv)
+        //              |
+        //          Conv (BF16)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(addNode, convNode1);
+        eltNode->set_friendly_name("ELT_1");
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(eltNode);
+        reluNode->set_friendly_name("RELU_1");
+
+        // Convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues2;
+            weightValues2.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2.data(), weightValues2.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2);
+        } else {
+            std::vector<short> weightValues2BF16;
+            weightValues2BF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            reluNode, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+    // Sets accuracy threshold and expected per-layer precisions
+    // ("ndef" marks layers fused into the convolution).
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 9e-2;
+        // STAGE2:
+        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["RELU_1"] = "ndef";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+// Runs the shared BF16 comparison flow (accuracy + per-layer precision checks).
+TEST_P(ScaleshiftConvEltwiseReluConv, CompareWithRefImpl) {
+    test();
+};
+
+// FP32 network precision: bfloat16 execution is enforced by plugin configuration.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEltwiseReluConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseReluConv::getTestCaseName);
+
+// BF16 network precision: graph constants are created directly as bfloat16.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEltwiseReluConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseReluConv::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Verifies bfloat16 handling of a residual-style pattern: the network input is
+// added (eltwise) to the output of a ScaleShift->Conv chain, followed by ReLU
+// and a second ScaleShift; the eltwise Add is expected to be fused into the
+// convolution.
+class ScaleshiftConvEltwiseScaleshift : public BasicBF16Test {
+protected:
+    // Builds the test graph below. Weights are sine-filled; BF16 constants are
+    // pre-rounded to bfloat16 bit patterns.
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        // Input ---- scaleshift (FP32)
+        //   \             |
+        //    \        Conv (BF16)
+        //     \         /
+        //     Eltwise (Fused into Conv)
+        //          |
+        //        ReLU
+        //          |
+        //     scaleshift (FP32)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Eltwise, i.e. Add: joins the raw input with the convolution output
+        auto eltNode = std::make_shared<opset1::Add>(input1, convNode1);
+        eltNode->set_friendly_name("ELT_1");
+
+        auto reluNode = std::make_shared<opset1::Relu>(eltNode);
+        reluNode->set_friendly_name("RELU_1");
+
+        // multiply
+        std::shared_ptr<ngraph::opset1::Constant> const3 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+        } else {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+        }
+        auto mulNode2 = std::make_shared<opset1::Multiply>(reluNode, const3);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const4 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto addNode2 = std::make_shared<opset1::Add>(mulNode2, const4);
+        addNode2->set_friendly_name("ADD_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{addNode2}, ngraph::ParameterVector{input1});
+    }
+    // Sets accuracy threshold and expected per-layer precisions.
+    // NOTE(review): no entry for RELU_1 — presumably it is fused together with
+    // ELT_1 into the convolution; confirm against the plugin's fusing rules.
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 0.4;
+        // STAGE2:
+        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["ADD_2"] = "FP32";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+// Runs the shared BF16 comparison flow (accuracy + per-layer precision checks).
+TEST_P(ScaleshiftConvEltwiseScaleshift, CompareWithRefImpl) {
+    test();
+};
+
+// FP32 network precision: bfloat16 execution is enforced by plugin configuration.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEltwiseScaleshift,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseScaleshift::getTestCaseName);
+
+// BF16 network precision: graph constants are created directly as bfloat16.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEltwiseScaleshift,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseScaleshift::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Verifies bfloat16 handling of a Conv->Elu->Conv chain: the Elu is currently
+// expected to execute in FP32 (not yet fused into the convolution).
+class ScaleshiftConvEluConv : public BasicBF16Test {
+protected:
+    // Builds the test graph below. Both convolutions share the same sine-filled
+    // weights constant; BF16 constants are pre-rounded to bfloat16 bit patterns.
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        // scaleshift (FP32)
+        //        |
+        //    Conv (BF16)
+        //        |
+        //    Elu (FP32 for now, this must be fixed and it must be fused into Conv)
+        //        |
+        //    Conv (BF16)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Elu with alpha = 2
+        auto eluNode = std::make_shared<opset1::Elu>(convNode1, 2);
+        eluNode->set_friendly_name("ELU_1");
+
+        // Conv: reuses the same weights constant as CONV_1
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            eluNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+    // Sets accuracy threshold and expected per-layer precisions.
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 1;
+        // STAGE2:
+        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["ELU_1"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+    }
+};
+
+// Runs the shared BF16 comparison flow (accuracy + per-layer precision checks).
+TEST_P(ScaleshiftConvEluConv, CompareWithRefImpl) {
+    test();
+};
+
+// FP32 network precision: bfloat16 execution is enforced by plugin configuration.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEluConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEluConv::getTestCaseName);
+
+// BF16 network precision: graph constants are created directly as bfloat16.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEluConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEluConv::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Minimal fusing case: ScaleShift -> Conv -> ReLU, where the ReLU is expected
+// to be fused into the BF16 convolution.
+class ScaleshiftConvRelu : public BasicBF16Test {
+protected:
+    // Builds the test graph below. Weights are sine-filled; BF16 constants are
+    // pre-rounded to bfloat16 bit patterns.
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        // scaleshift (FP32)
+        //        |
+        //    Conv (BF16)
+        //        |
+        //    relu (Fused into convolution)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // output channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(convNode1);
+        reluNode->set_friendly_name("RELU_1");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{reluNode}, ngraph::ParameterVector{input1});
+    }
+    // Sets accuracy threshold and expected per-layer precisions
+    // ("ndef" marks the ReLU fused into the convolution).
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 5e-2;
+        // STAGE2:
+        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["RELU_1"] = "ndef";
+    }
+};
+
+// Runs the shared BF16 comparison flow (accuracy + per-layer precision checks).
+TEST_P(ScaleshiftConvRelu, CompareWithRefImpl) {
+    test();
+};
+
+// FP32 network precision: bfloat16 execution is enforced by plugin configuration.
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvRelu,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvRelu::getTestCaseName);
+
+// BF16 network precision: graph constants are created directly as bfloat16.
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvRelu,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvRelu::getTestCaseName);
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConv_x2_ConcatRelu : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift
+ // / \
+ // Conv Conv
+ // \ /
+ // concat
+ // |
+ // relu
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_1");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ // Concat
+ ngraph::NodeVector concInputNodes = { convNode1, convNode2 };
+
+ // test is to be failed, if axis == 1 - TODO
+ auto concNode = std::make_shared<opset1::Concat>(concInputNodes, 2);
+ concNode->set_friendly_name("CONC_1");
+
+ // ReLU
+ auto reluNode = std::make_shared<opset1::Relu>(concNode);
+ reluNode->set_friendly_name("RELU_1");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{reluNode}, ngraph::ParameterVector{input1});
+ }
+
+ // Reads test parameters, builds the graph under test and fills the map of
+ // expected per-layer execution precisions (verified via performance counters).
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1: accuracy threshold for comparison against the FP32 reference run
+ threshold = 10e-1;
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["ADD_1"] = "FP32";
+ expectedPrecisions["CONV_1"] = "BF16";
+ expectedPrecisions["CONV_2"] = "BF16";
+ expectedPrecisions["CONC_1"] = "FP32";
+ expectedPrecisions["RELU_1"] = "FP32";
+ }
+};
+
+// Run the comparison test for both a pure-FP32 network and a BF16 network.
+TEST_P(ScaleshiftConv_x2_ConcatRelu, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_ConcatRelu,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_ConcatRelu::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_ConcatRelu,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_ConcatRelu::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Fixture: ScaleShift (Multiply+Add) feeding two parallel Convolutions whose
+// outputs are summed by an Eltwise; checks BF16/FP32 execution precision of
+// each layer through performance counters.
+class ScaleshiftConv_x2_Eltwise : public BasicBF16Test {
+protected:
+ // Builds the ngraph function; netPrecision selects FP32 or BF16 element type
+ // for constants and convolution weights.
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift (FP32)
+ // / \
+ // Conv1 (BF16) Conv2 (BF16)
+ // \ /
+ // eltwise (Fused into Conv1) produce FP32 output
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_1");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ // second branch: same weights and the same scaleshift output as input
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ // Eltwise, i.e. Add
+ auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+ eltNode->set_friendly_name("ELT_1");
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode}, ngraph::ParameterVector{input1});
+ }
+
+ // Reads test parameters, builds the graph and records expected per-layer
+ // execution precisions ("ndef" = fused node with no defined own precision).
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 2e-1;
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["ADD_1"] = "FP32";
+ expectedPrecisions["CONV_1"] = "BF16";
+ expectedPrecisions["CONV_2"] = "BF16";
+ expectedPrecisions["ELT_1"] = "ndef";
+ }
+};
+
+// Run the comparison test for both a pure-FP32 network and a BF16 network.
+TEST_P(ScaleshiftConv_x2_Eltwise, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_Eltwise::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Fixture: mixed-precision topology — one convolution fed by the ScaleShift
+// (expected BF16) and one fed directly by the input (expected FP32), joined
+// by an Eltwise; precision is checked via performance counters.
+class ScaleshiftConv_x2_mixed1_Eltwise : public BasicBF16Test {
+protected:
+ // Builds the ngraph function; netPrecision selects FP32 or BF16 element type
+ // for constants and convolution weights.
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift (FP32)
+ // | |
+ // Conv1(BF16) Conv2(FP32)
+ // \ /
+ // eltwise(Fused into Conv1)
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_1");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ // second convolution takes the raw input (bypasses the scaleshift)
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ input1, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ // Eltwise, i.e. Add
+ auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+ eltNode->set_friendly_name("ELT_1");
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode}, ngraph::ParameterVector{input1});
+ }
+
+ // Reads test parameters, builds the graph and records expected per-layer
+ // execution precisions ("ndef" = fused node with no defined own precision).
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 2e-1;
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["ADD_1"] = "FP32";
+ expectedPrecisions["CONV_1"] = "BF16";
+ expectedPrecisions["CONV_2"] = "FP32";
+ expectedPrecisions["ELT_1"] = "ndef";
+ }
+};
+
+// Run the comparison test for both a pure-FP32 network and a BF16 network.
+TEST_P(ScaleshiftConv_x2_mixed1_Eltwise, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_mixed1_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_mixed1_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_mixed1_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_mixed1_Eltwise::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Fixture: mirror of mixed1 — the convolution fed directly by the input is
+// expected FP32 (CONV_1) and the one fed by the ScaleShift is expected BF16
+// (CONV_2); the joining Eltwise is fused ("ndef").
+class ScaleshiftConv_x2_mixed2_Eltwise : public BasicBF16Test {
+protected:
+ // Builds the ngraph function; netPrecision selects FP32 or BF16 element type
+ // for constants and convolution weights.
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift (FP32)
+ // | |
+ // Conv1 (FP32) Conv2 (Bf16)
+ // \ /
+ // eltwise (Fused into Conv1)
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ // first convolution takes the raw input (bypasses the scaleshift)
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ input1, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("CONV_1");
+
+ // multiply
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("ADD_2");
+
+ // convolution
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("CONV_2");
+
+ // Eltwise, i.e. Add
+ auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+ eltNode->set_friendly_name("ELT_1");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode}, ngraph::ParameterVector{input1});
+ }
+
+ // Reads test parameters, builds the graph and records expected per-layer
+ // execution precisions ("ndef" = fused node with no defined own precision).
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 2e-1;
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["CONV_1"] = "FP32";
+ expectedPrecisions["ADD_2"] = "FP32";
+ expectedPrecisions["CONV_2"] = "BF16";
+ expectedPrecisions["ELT_1"] = "ndef";
+ }
+};
+
+// Run the comparison test for both a pure-FP32 network and a BF16 network.
+TEST_P(ScaleshiftConv_x2_mixed2_Eltwise, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_mixed2_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_mixed2_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_mixed2_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x2_mixed2_Eltwise::getTestCaseName);
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Fixture: ScaleShift -> two parallel BF16 Convolutions -> fused Eltwise ->
+// third BF16 Convolution; checks per-layer execution precision via
+// performance counters.
+class ScaleshiftConv_x3_Eltwise : public BasicBF16Test {
+protected:
+ // Builds the ngraph function; netPrecision selects FP32 or BF16 element type
+ // for constants and convolution weights.
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // scaleshift (FP32)
+ //
+ // / \
+ //
+ // Conv1 (BF16) Conv2 (BF16)
+ //
+ // \ /
+ //
+ // Eltwise (Fused to Conv1)
+ //
+ // |
+ //
+ // Conv3 (BF16)
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("Add_1");
+
+ // convolution: 16 output channels so Conv3 below consumes a 16-channel tensor
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 16, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(16 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(16 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("Convolution_1");
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("Convolution_2");
+
+ // Eltwise, i.e. Add
+ auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+ eltNode->set_friendly_name("ELT_1");
+
+
+ // Convolution: separate 16x16 weights matching the eltwise output channels
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode3 = nullptr;
+ ngraph::Shape convFilterShape3 = { 16, 16, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(16 * 16 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(16 * 16 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode3 = std::make_shared<ngraph::opset1::Convolution>(
+ eltNode, weightsNode3,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode3->set_friendly_name("Convolution_3");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode3}, ngraph::ParameterVector{input1});
+ }
+
+ // Reads test parameters, builds the graph and records expected per-layer
+ // execution precisions ("ndef" = fused node with no defined own precision).
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 1.0f; // max value in the latest tensor for FP32 network is 93.3
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_1"] = "FP32";
+ expectedPrecisions["Convolution_1"] = "BF16";
+ expectedPrecisions["Convolution_2"] = "BF16";
+ expectedPrecisions["ELT_1"] = "ndef";
+ expectedPrecisions["Convolution_3"] = "BF16";
+ }
+};
+
+// Run the comparison test for both a pure-FP32 network and a BF16 network.
+TEST_P(ScaleshiftConv_x3_Eltwise, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x3_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x3_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x3_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ ScaleshiftConv_x3_Eltwise::getTestCaseName);
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+// Fixture: two ScaleShift branches from one input; their sum stays FP32 while
+// one branch also feeds a chain of two BF16 Convolutions. Both the Eltwise and
+// the second Convolution are graph outputs.
+class Scaleshift_x2_Conv_x2_Eltwise : public BasicBF16Test {
+protected:
+ // Builds the ngraph function; netPrecision selects FP32 or BF16 element type
+ // for constants and convolution weights.
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ //
+ // scaleshift (FP32) scaleshift (FP32)
+ // \ / \
+ // Eltwise (FP32) Conv (BF16)
+ // | |
+ // Conv (BF16)
+ // |
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("Add_1");
+
+ // multiply (second scaleshift branch)
+ std::shared_ptr<ngraph::opset1::Constant> const3 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const3 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+ } else {
+ const3 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+ }
+ auto mulNode2 = std::make_shared<opset1::Multiply>(input1, const3);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const4 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const4 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const4 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto addNode2 = std::make_shared<opset1::Add>(mulNode2, const4);
+ addNode2->set_friendly_name("Add_2");
+
+ // Eltwise, i.e. Add
+ auto eltNode = std::make_shared<opset1::Add>(addNode, addNode2);
+ eltNode->set_friendly_name("ELT_1");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode2, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("Convolution_1");
+
+ // second convolution chained after the first, reusing the same weights
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ convNode1, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("Convolution_2");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode, convNode2}, ngraph::ParameterVector{input1});
+ }
+ // Reads test parameters, builds the graph and records expected per-layer
+ // execution precisions checked against performance counters.
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 1;
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_1"] = "FP32";
+ expectedPrecisions["Add_2"] = "FP32";
+ expectedPrecisions["Convolution_1"] = "BF16";
+ expectedPrecisions["Convolution_2"] = "BF16";
+ expectedPrecisions["ELT_1"] = "FP32";
+ }
+};
+
+// Run the comparison test for both a pure-FP32 network and a BF16 network.
+TEST_P(Scaleshift_x2_Conv_x2_Eltwise, CompareWithRefImpl) {
+ test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, Scaleshift_x2_Conv_x2_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ Scaleshift_x2_Conv_x2_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, Scaleshift_x2_Conv_x2_Eltwise,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ Scaleshift_x2_Conv_x2_Eltwise::getTestCaseName);
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class Scaleshift_x3_ConvEltwiseRelu : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ //
+ // scaleshift (FP32)
+ // |
+ // Conv (BF16) scaleshift (FP32
+ //
+ // \ /
+ //
+ // Eltwise (Fused to Conv)
+ // |
+ // ReLU (Fused to Conv)
+ // |
+ // scaleshift (FP32)
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("Add_1");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 3, 3, 3, 3 }; // out channel, /input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(3 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 1, 1 }), // pad begin
+ ngraph::CoordinateDiff({ 1, 1 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode1->set_friendly_name("Convolution_1");
+
+ // multiply
+ std::shared_ptr<ngraph::opset1::Constant> const3 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const3 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+ } else {
+ const3 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+ }
+ auto mulNode2 = std::make_shared<opset1::Multiply>(input1, const3);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const4 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const4 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const4 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto addNode2 = std::make_shared<opset1::Add>(mulNode2, const4);
+ addNode2->set_friendly_name("Add_2");
+
+ // Eltwise, i.e. Add
+ auto eltNode = std::make_shared<opset1::Add>(convNode1, addNode2);
+ eltNode->set_friendly_name("ELT_1");
+
+ // ReLU
+ auto reluNode = std::make_shared<opset1::Relu>(eltNode);
+ reluNode->set_friendly_name("RELU_1");
+
+ // multiply
+ std::shared_ptr<ngraph::opset1::Constant> const5 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const5 = opset1::Constant::create(ntype, Shape{1}, { 4.0f });
+ } else {
+ const5 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(4.0f)) });
+ }
+ auto mulNode3 = std::make_shared<opset1::Multiply>(reluNode, const5);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const6 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const6 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+ } else {
+ const6 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+ }
+ auto addNode3 = std::make_shared<opset1::Add>(mulNode3, const6);
+ addNode3->set_friendly_name("Add_3");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{addNode3}, ngraph::ParameterVector{input1});
+ }
+
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ // STAGE1:
+ threshold = 2e-1;
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_1"] = "FP32";
+ expectedPrecisions["Convolution_1"] = "BF16";
+ expectedPrecisions["Add_2"] = "FP32";
+ expectedPrecisions["ELT_1"] = "ndef";
+ expectedPrecisions["RELU_1"] = "ndef";
+ expectedPrecisions["Add_3"] = "FP32";
+ }
+};
+
+ TEST_P(Scaleshift_x3_ConvEltwiseRelu, CompareWithRefImpl) {
+ test();
+ };
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, Scaleshift_x3_ConvEltwiseRelu,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ Scaleshift_x3_ConvEltwiseRelu::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, Scaleshift_x3_ConvEltwiseRelu,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ Scaleshift_x3_ConvEltwiseRelu::getTestCaseName);
+
+
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class PoolingAfterConv : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // Scaleshift (FP32)
+ // |
+ // Convolution (BF16)
+ // |
+ // ReLU (Fused)
+ // |
+ // Pooling (FP32) <- this layer can be executed in bf16 if it passes data to next bf16 layer
+ // in other case there should be tail optimization and return Pooling to FP32
+
+ // STAGE1: construction of the GRAPH
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("Add_4");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 16, 3, 3, 3 }; // out channels, input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(16 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(16 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode->set_friendly_name("Convolution_6");
+
+ // ReLU
+ auto reluNode = std::make_shared<opset1::Relu>(convNode);
+
+ // Pooling
+ auto avgpoolNode = std::make_shared<opset1::AvgPool>(reluNode,
+ Strides{1, 1},
+ Shape{1, 1},
+ Shape{1, 1},
+ Shape{2, 2},
+ true,
+ op::RoundingType::FLOOR);
+ avgpoolNode->set_friendly_name("AvgPool_8");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{avgpoolNode}, ngraph::ParameterVector{input1});
+ }
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ threshold = 0.14f; // max value in the latest tensor for FP32 network is 14.6448
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_4"] = "FP32";
+ expectedPrecisions["Convolution_6"] = "BF16";
+ expectedPrecisions["AvgPool_8"] = "FP32";
+ }
+};
+
+TEST_P(PoolingAfterConv, CompareWithRefImpl) {
+ test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, PoolingAfterConv,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ PoolingAfterConv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, PoolingAfterConv,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ PoolingAfterConv::getTestCaseName);
+
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class TopKInputsI32 : public BasicBF16Test {
+protected:
+ std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+ // Power (FP32)
+ // |
+ // Convolution1 (BF16) Const (I32)
+ // | |
+ // \ /
+ // TopK (FP32)
+ // (BF16)/ \ (I32)
+ // |
+ // Convolution 2
+
+ // STAGE1: construction of the GRAPH
+
+ ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+ // multiply
+ auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+ input1->set_friendly_name("Input_1");
+ std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+ } else {
+ const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+ }
+ auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+ // add
+ std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+ if (netPrecision == Precision::FP32) {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+ } else {
+ const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+ }
+ auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+ addNode->set_friendly_name("Add_4");
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+ ngraph::Shape convFilterShape = { 16, 3, 3, 3 }; // out channels, input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(16 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(16 * 3 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+ addNode, weightsNode,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode->set_friendly_name("Convolution_1");
+
+ // TopK
+ const auto k = make_shared<op::Constant>(element::i32, Shape{}, vector<int>{1});
+ size_t axis = 1;
+ ngraph::op::v1::TopK::Mode mode = ngraph::op::v1::TopK::Mode::MAX;
+ ngraph::op::v1::TopK::SortType sort = ngraph::op::v1::TopK::SortType::NONE;
+ auto argmaxNode = std::make_shared<opset1::TopK>(convNode, k, axis, mode, sort);
+ argmaxNode->set_friendly_name("TopK_1");
+
+ auto goe0 = make_shared<op::GetOutputElement>(argmaxNode, 0);
+ auto goe1 = make_shared<op::GetOutputElement>(argmaxNode, 1);
+
+ // convolution
+ std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+ ngraph::Shape convFilterShape2 = { 1, 1, 3, 3 }; // out channels, input channels, kernel h, kernel w
+ if (netPrecision == Precision::FP32) {
+ std::vector<float> weightValuesFP32;
+ weightValuesFP32.resize(1 * 1 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesFP32);
+ } else {
+ std::vector<short> weightValuesBF16;
+ weightValuesBF16.resize(1 * 1 * 3 * 3);
+ BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+ weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesBF16.data());
+ }
+
+ std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+ goe0, weightsNode2,
+ ngraph::Strides({ 1, 1 }), // strides
+ ngraph::CoordinateDiff({ 0, 0 }), // pad begin
+ ngraph::CoordinateDiff({ 0, 0 }), // pad end
+ ngraph::Strides({ 1, 1 }), // dilation
+ ngraph::op::PadType::EXPLICIT); // pad type
+ convNode2->set_friendly_name("Convolution_2");
+
+ return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2, goe1}, ngraph::ParameterVector{input1});
+ }
+ void SetUp()override {
+ std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+ fnPtr = createGraph(netPrecision);
+
+ threshold = 0.14f; // max value in the latest tensor for FP32 network is 22.6
+
+ // STAGE2:
+ // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
+ // performance counters
+ expectedPrecisions["Add_4"] = "FP32";
+ expectedPrecisions["Convolution_1"] = "BF16";
+ expectedPrecisions["Convolution_2"] = "BF16";
+ expectedPrecisions["TopK_1"] = "FP32";
+ }
+};
+
+TEST_P(TopKInputsI32, CompareWithRefImpl) {
+ test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, TopKInputsI32,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ TopKInputsI32::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, TopKInputsI32,
+ ::testing::Combine(
+ ::testing::Values(Precision::FP32),
+ ::testing::Values(Precision::BF16),
+ ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+ ::testing::Values(SizeVector()),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+ TopKInputsI32::getTestCaseName);
+
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/reshape.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+// Common params
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+ InferenceEngine::Precision::FP32,
+ InferenceEngine::Precision::U8
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+ InferenceEngine::Precision::FP32,
+ InferenceEngine::Precision::FP16
+};
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheckDynBatch, ReshapeLayerTest,
+ ::testing::Combine(
+ ::testing::Values(true),
+ ::testing::ValuesIn(inputPrecisions),
+ ::testing::ValuesIn(netPrecisions),
+ ::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
+ ::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(std::map<std::string, std::string>({{CONFIG_KEY(DYN_BATCH_ENABLED), CONFIG_VALUE(YES)}}))),
+ ReshapeLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheck, ReshapeLayerTest,
+ ::testing::Combine(
+ ::testing::Values(true),
+ ::testing::ValuesIn(inputPrecisions),
+ ::testing::ValuesIn(netPrecisions),
+ ::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
+ ::testing::Values(std::vector<size_t>({10, 0, 100})),
+ ::testing::Values(CommonTestUtils::DEVICE_CPU),
+ ::testing::Values(std::map<std::string, std::string>({}))),
+ ReshapeLayerTest::getTestCaseName);
+} // namespace
\ No newline at end of file
--- /dev/null
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/softmax.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+ InferenceEngine::Precision::FP32,
+};
+
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+ InferenceEngine::Precision::FP32,
+};
+
+const std::vector<InferenceEngine::Layout> inputLayouts2D = {
+ InferenceEngine::Layout::NC,
+};
+
+const std::vector<InferenceEngine::SizeVector> inputShapes2D = {
+ InferenceEngine::SizeVector {1, 100},
+};
+
+const std::vector<size_t> axis2D = {
+ 1
+};
+
+const auto params2D = testing::Combine(
+ testing::ValuesIn(netPrecisions),
+ testing::ValuesIn(inputPrecisions),
+ testing::ValuesIn(inputLayouts2D),
+ testing::ValuesIn(inputShapes2D),
+ testing::ValuesIn(axis2D),
+ testing::Values(CommonTestUtils::DEVICE_CPU),
+ testing::Values(std::map<std::string, std::string>())
+);
+
+INSTANTIATE_TEST_CASE_P(
+ SoftMax2D,
+ SoftMaxLayerTest,
+ params2D,
+ SoftMaxLayerTest::getTestCaseName
+);
+
+} // namespace
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
set(TARGET_NAME gpuFuncTests)
addIeTargetTest(
- NAME ${TARGET_NAME}
- ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+ NAME
+ ${TARGET_NAME}
+ ROOT
+ ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDENCIES
clDNNPlugin
LINK_LIBRARIES
funcSharedTests
+ ${CLDNN__IOCL_ICD_LIBPATH}
ADD_CPPLINT
LABELS
GPU
-)
\ No newline at end of file
+)
+target_include_directories(${TARGET_NAME} PRIVATE ${CLDNN__IOCL_ICD_INCDIRS})
\ No newline at end of file
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <utility>
+#include <vector>
+#include <memory>
+
+#include <cpp/ie_cnn_net_reader.h>
+#include <inference_engine.hpp>
+#include <ie_compound_blob.h>
+
+#include <cldnn/cldnn_config.hpp>
+
+#ifdef _WIN32
+# include <gpu/gpu_context_api_dx.hpp>
+#elif defined ENABLE_LIBVA
+# include <gpu/gpu_context_api_va.hpp>
+#endif
+#include <gpu/gpu_context_api_ocl.hpp>
+#include <common_test_utils/test_common.hpp>
+#include <functional_test_utils/plugin_cache.hpp>
+
+#include "ngraph_functions/subgraph_builders.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+
+using namespace ::testing;
+using namespace InferenceEngine;
+using namespace InferenceEngine::gpu;
+
+struct OpenCL {
+ cl::Context _context;
+ cl::Device _device;
+ cl::CommandQueue _queue;
+
+ explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
+ // get Intel iGPU OCL device, create context and queue
+ {
+ const unsigned int refVendorID = 0x8086;
+ cl_uint n = 0;
+ cl_int err = clGetPlatformIDs(0, NULL, &n);
+
+ // Get platform list
+ std::vector<cl_platform_id> platform_ids(n);
+ err = clGetPlatformIDs(n, platform_ids.data(), NULL);
+
+ for (auto& id : platform_ids) {
+ cl::Platform platform = cl::Platform(id);
+ std::vector<cl::Device> devices;
+ platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
+ for (auto& d : devices) {
+ if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
+ _device = d;
+ _context = cl::Context(_device);
+ break;
+ }
+ }
+ }
+ cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+ _queue = cl::CommandQueue(_context, _device, props);
+ }
+ }
+
+ explicit OpenCL(cl_context context) {
+ // user-supplied context handle
+ _context = cl::Context(context, true);
+ _device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);
+
+ cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+ _queue = cl::CommandQueue(_context, _device, props);
+ }
+};
+
+class RemoteBlob_Test : public CommonTestUtils::TestsCommon {
+protected:
+ std::shared_ptr<ngraph::Function> fn_ptr;
+ virtual void SetUp() {
+ fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
+ }
+};
+
+TEST_F(RemoteBlob_Test, canInputUserBlob) {
+#if defined(_WIN32) || defined(ANDROID)
+ GTEST_SKIP();
+#endif
+ CNNNetwork net(fn_ptr);
+
+ net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+ net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+
+ auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+ // TODO: Issue: investigate issue with IECore
+ auto ie = InferenceEngine::Core();
+ auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
+
+ // regular inference
+ auto inf_req_regular = exec_net.CreateInferRequest();
+ InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+ inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+ inf_req_regular.Infer();
+ auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
+
+ // inference using remote blob
+ auto inf_req_shared = exec_net.CreateInferRequest();
+ auto cldnn_context = exec_net.GetContext();
+ cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
+ auto ocl_instance = std::make_shared<OpenCL>(ctx);
+ cl_int err;
+
+ auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims();
+ size_t imSize = dims[1] * dims[2] * dims[3];
+
+ cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
+ {
+ void* buffer = fakeImageData->buffer();
+ ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+ }
+
+ Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context, shared_buffer);
+ inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob);
+
+ inf_req_shared.Infer();
+ auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
+
+ // compare results
+ {
+ ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+ ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+ auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+ FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+ }
+}
+
+TEST_F(RemoteBlob_Test, canInferOnUserContext) {
+#if defined _WIN32
+ GTEST_SKIP();
+#endif
+ auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
+ CNNNetwork net(fn_ptr);
+
+ net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+ net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+
+ auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+
+ auto ie = InferenceEngine::Core();
+ auto exec_net_regular = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
+
+ // regular inference
+ auto inf_req_regular = exec_net_regular.CreateInferRequest();
+ auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+ inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+ inf_req_regular.Infer();
+ auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
+
+ // inference using remote blob
+ auto ocl_instance = std::make_shared<OpenCL>();
+ auto remote_context = make_shared_context(ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_context.get());
+ auto exec_net_shared = ie.LoadNetwork(net, remote_context);
+ auto inf_req_shared = exec_net_shared.CreateInferRequest();
+ inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+ inf_req_shared.Infer();
+ auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
+
+ // compare results
+ {
+ ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+ ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+ auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+ FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+ }
+}
+
+class TwoNets_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
+ void SetUp() override {
+ num_streams = this->GetParam();
+ fn_ptrs = {ngraph::builder::subgraph::makeSplitMultiConvConcat(),
+ ngraph::builder::subgraph::makeMultiSingleConv()};
+ };
+public:
+ static std::string getTestCaseName(const testing::TestParamInfo<std::size_t> & obj) {
+ return "num_streams_" + std::to_string(obj.param);
+ }
+protected:
+ size_t num_streams;
+ std::vector<std::shared_ptr<ngraph::Function>> fn_ptrs;
+};
+
+TEST_P(TwoNets_Test, canInferTwoExecNets) {
+ std::vector<InferenceEngine::CNNNetwork> nets;
+ for (auto &fn_ptr : fn_ptrs) {
+ nets.push_back(CNNNetwork(fn_ptr));
+ }
+
+ auto ie = InferenceEngine::Core();
+
+ std::vector<std::string> outputs;
+ std::vector<InferRequest> irs;
+ std::vector<std::shared_ptr<float*>> ref;
+ std::vector<int> outElementsCount;
+
+ for (size_t i = 0; i < nets.size(); ++i) {
+ auto net = nets[i];
+
+ net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+ net.getInputsInfo().begin()->second->setPrecision(Precision::FP32);
+
+ auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU,
+ {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+
+ for (int j = 0; j < num_streams; j++) {
+ outputs.push_back(net.getOutputsInfo().begin()->first);
+
+ auto inf_req = exec_net.CreateInferRequest();
+ irs.push_back(inf_req);
+
+ auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+ inf_req.SetBlob(net.getInputsInfo().begin()->first, blob);
+
+ outElementsCount.push_back(std::accumulate(begin(fn_ptrs[i]->get_output_shape(0)), end(fn_ptrs[i]->get_output_shape(0)), 1,
+ std::multiplies<size_t>()));
+
+ std::shared_ptr<float*> reOutData = ngraph::helpers::inferFnWithInterp<ngraph::element::Type_t::f32>(
+ fn_ptrs[i], {inf_req.GetBlob(net.getInputsInfo().begin()->first)->buffer()}).front();
+ ref.push_back(reOutData);
+ }
+ }
+
+ const int niter = 10;
+ for (int i = 0; i < niter; i++) {
+ for (auto ir : irs) {
+ ir.StartAsync();
+ }
+
+ for (auto ir : irs) {
+ ir.Wait(IInferRequest::RESULT_READY);
+ }
+ }
+
+ for (auto& net : nets) {
+ ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+ }
+ auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+ for (size_t i = 0; i < irs.size(); ++i) {
+ ASSERT_EQ(outElementsCount[i], irs[i].GetBlob(outputs[i])->size());
+ FuncTestUtils::compareRawBuffers(irs[i].GetBlob(outputs[i])->buffer().as<float*>(), *ref[i], outElementsCount[i],
+ outElementsCount[i],
+ thr);
+ }
+}
+
+const std::vector<size_t> num_strems{1, 2};
+
+INSTANTIATE_TEST_CASE_P(RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_strems), TwoNets_Test::getTestCaseName);
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include "single_layer_tests/reshape.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+// Common params
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+ InferenceEngine::Precision::FP32,
+ InferenceEngine::Precision::U8
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+ InferenceEngine::Precision::FP32,
+ InferenceEngine::Precision::FP16
+};
+
+// TODO: Issue: 28981 (dynamic-batch reshape disabled until fixed; note the test-name prefix must be DISABLED_ for gtest to skip it)
+INSTANTIATE_TEST_CASE_P(DISABLE_ReshapeCheckDynBatch, ReshapeLayerTest,
+ ::testing::Combine(
+ ::testing::Values(true),
+ ::testing::ValuesIn(inputPrecisions),
+ ::testing::ValuesIn(netPrecisions),
+ ::testing::Values(std::vector<size_t>({1, 16, 16, 16})),
+ ::testing::Values(std::vector<size_t>({1, 0, 256})),
+ ::testing::Values(CommonTestUtils::DEVICE_GPU),
+ ::testing::Values(std::map<std::string, std::string>({{CONFIG_KEY(DYN_BATCH_ENABLED), CONFIG_VALUE(YES)}}))),
+ ReshapeLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheck, ReshapeLayerTest,
+ ::testing::Combine(
+ ::testing::Values(true),
+ ::testing::ValuesIn(inputPrecisions),
+ ::testing::ValuesIn(netPrecisions),
+ ::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
+ ::testing::Values(std::vector<size_t>({10, 0, 100})),
+ ::testing::Values(CommonTestUtils::DEVICE_GPU),
+ ::testing::Values(std::map<std::string, std::string>({}))),
+ ReshapeLayerTest::getTestCaseName);
+} // namespace
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/strided_slice.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+stridedSliceParamsTuple ss_only_test_cases[] = {
+ stridedSliceParamsTuple({ 2, 2, 2, 2 }, { 0, 0, 0, 0 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 },
+ {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+ stridedSliceParamsTuple({ 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 },
+ {0, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+ stridedSliceParamsTuple({ 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 },
+ {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+ stridedSliceParamsTuple({ 2, 2, 4, 3 }, { 0, 0, 0, 0 }, { 2, 2, 4, 3 }, { 1, 1, 2, 1 },
+ {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+ stridedSliceParamsTuple({ 2, 2, 4, 2 }, { 1, 0, 0, 1 }, { 2, 2, 4, 2 }, { 1, 1, 2, 1 },
+ {0, 1, 1, 0}, {1, 1, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+ stridedSliceParamsTuple({ 1, 2, 4, 2 }, { 1, 0, 0, 0 }, { 1, 2, 4, 2 }, { 1, 1, -2, -1 },
+ {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+ stridedSliceParamsTuple({ 2, 2, 4, 2 }, { 1, 0, 0, 0 }, { 1, 2, 4, 2 }, { 1, 1, -2, -1 },
+ {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1},
+ InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+ CommonTestUtils::DEVICE_GPU),
+};
+
+INSTANTIATE_TEST_CASE_P(
+ smoke_CLDNN, StridedSliceLayerTest, ::testing::ValuesIn(ss_only_test_cases),
+ StridedSliceLayerTest::getTestCaseName);
+
+
+} // namespace
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-# Copyright (C) 2019 Intel Corporation
+# Copyright (C) 2019-2020 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
set(TARGET_NAME myriadFuncTests)
+disable_deprecated_warnings()
+
addIeTargetTest(
NAME ${TARGET_NAME}
ROOT ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDENCIES
myriadPlugin
LINK_LIBRARIES
+ vpu_common_lib
+ vpu_graph_transformer
funcSharedTests
ADD_CPPLINT
LABELS
VPU
MYRIAD
-)
\ No newline at end of file
+)
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+#include "ngraph/op/parameter.hpp"
+#include "ngraph/function.hpp"
+
+#include "cpp/ie_cnn_network.h"
+#include "ie_common.h"
+
+#include "common_test_utils/test_common.hpp"
+
+#include <gtest/gtest.h>
+
+namespace {
+
+class DynamicShapeResolverTests : public CommonTestUtils::TestsCommon {
+public:
+ void SetUp() override {
+ const auto tensorType = ngraph::element::f16;
+ const auto shapeType = ngraph::element::u64;
+ const auto tensorShape = std::initializer_list<std::size_t>{1, 800};
+
+ const auto tensor = std::make_shared<ngraph::op::Parameter>(tensorType, ngraph::Shape{tensorShape});
+ const auto shape = std::make_shared<ngraph::op::Parameter>(shapeType, ngraph::Shape{tensorShape.size()});
+ auto dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(tensor, shape);
+ dynamicShapeResolver->set_friendly_name(s_FriendlyName);
+ const auto function = std::make_shared<ngraph::Function>(ngraph::NodeVector{dynamicShapeResolver}, ngraph::ParameterVector{tensor, shape});
+
+ cnnNetwork = InferenceEngine::CNNNetwork{function};
+ triggerConversionToCNNNetwork();
+ }
+
+protected:
+ InferenceEngine::CNNLayerPtr getDynamicShapeResolverLayer() const {
+ return cnnNetwork.getLayerByName(s_FriendlyName.c_str());
+ }
+ InferenceEngine::CNNNetwork cnnNetwork;
+
+private:
+ void triggerConversionToCNNNetwork() {
+ cnnNetwork.begin();
+ }
+
+ static const std::string s_FriendlyName;
+};
+
+const std::string DynamicShapeResolverTests::s_FriendlyName = "DSR";
+
+TEST_F(DynamicShapeResolverTests, NGraphFunctionCanBeConvertedToCNNNetwork) {
+ ASSERT_EQ(cnnNetwork.getInputsInfo().size(), 2);
+ ASSERT_EQ(cnnNetwork.layerCount(), cnnNetwork.getInputsInfo().size() + 1);
+ ASSERT_EQ(cnnNetwork.getOutputsInfo().size(), 1);
+
+ const auto dynamicShapeResolver = getDynamicShapeResolverLayer();
+ ASSERT_EQ(dynamicShapeResolver->type, "DynamicShapeResolver");
+ ASSERT_EQ(dynamicShapeResolver->insData.size(), 2);
+ ASSERT_EQ(dynamicShapeResolver->outData.size(), 1);
+}
+
+} // namespace
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph/op/parameter.hpp>
+#include <ngraph/function.hpp>
+
+#include <gtest/gtest.h>
+#include <common_test_utils/test_common.hpp>
+#include <details/ie_exception.hpp>
+
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+namespace {
+
+using DataType = ngraph::element::Type_t;
+using DimsType = ngraph::element::Type_t;
+using DataShape = ngraph::Shape;
+
+class DynamicShapeResolverTests : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<std::tuple<DataType, DimsType, DataShape>> {
+public:
+ void SetUp() override {
+ const auto& parameters = GetParam();
+ const auto& dataType = std::get<0>(parameters);
+ const auto& dimsType = std::get<1>(parameters);
+ const auto& dataShape = std::get<2>(parameters);
+
+ data = std::make_shared<ngraph::op::Parameter>(dataType, dataShape);
+ dims = std::make_shared<ngraph::op::Parameter>(dimsType, ngraph::Shape{dataShape.size()});
+ }
+
+protected:
+ std::shared_ptr<ngraph::op::Parameter> data;
+ std::shared_ptr<ngraph::op::Parameter> dims;
+};
+
+TEST_P(DynamicShapeResolverTests, CanValidateAndInferTypes) {
+ std::shared_ptr<ngraph::op::DynamicShapeResolver> dynamicShapeResolver;
+ ASSERT_NO_THROW(dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims));
+ ASSERT_NO_THROW(std::make_shared<ngraph::Function>(ngraph::NodeVector{dynamicShapeResolver}, ngraph::ParameterVector{data, dims}));
+}
+
+std::set<ngraph::element::Type_t> allNGraphTypes() {
+ return {
+ ngraph::element::dynamic,
+ ngraph::element::boolean,
+ ngraph::element::bf16,
+ ngraph::element::f16,
+ ngraph::element::f32,
+ ngraph::element::f64,
+ ngraph::element::i8,
+ ngraph::element::i16,
+ ngraph::element::i32,
+ ngraph::element::i64,
+ ngraph::element::u1,
+ ngraph::element::u8,
+ ngraph::element::u16,
+ ngraph::element::u32,
+ ngraph::element::u64
+ };
+}
+
+std::set<ngraph::element::Type_t> allNGraphIntegralNumberTypes() {
+ return {
+ ngraph::element::i8,
+ ngraph::element::i16,
+ ngraph::element::i32,
+ ngraph::element::i64,
+ ngraph::element::u1,
+ ngraph::element::u8,
+ ngraph::element::u16,
+ ngraph::element::u32,
+ ngraph::element::u64
+ };
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverTests, testing::Combine(
+ testing::ValuesIn(allNGraphTypes()),
+ testing::ValuesIn(allNGraphIntegralNumberTypes()),
+ testing::Values(DataShape{1, 800}, DataShape{1, 1})));
+
+
+using DataPartialShape = ngraph::PartialShape;
+using DimsPartialShape = ngraph::PartialShape;
+class DynamicShapeResolverNegativeTests
+ : public CommonTestUtils::TestsCommon
+ , public testing::WithParamInterface<std::tuple<DataType, DimsType, DataPartialShape, DimsPartialShape>> {
+public:
+ void SetUp() override {
+ const auto& parameters = GetParam();
+ const auto& dataType = std::get<0>(parameters);
+ const auto& dimsType = std::get<1>(parameters);
+ const auto& dataPartialShape = std::get<2>(parameters);
+ const auto& dimsPartialShape = std::get<3>(parameters);
+
+ data = std::make_shared<ngraph::op::Parameter>(dataType, dataPartialShape);
+ dims = std::make_shared<ngraph::op::Parameter>(dimsType, dimsPartialShape);
+ }
+
+protected:
+ std::shared_ptr<ngraph::op::Parameter> data;
+ std::shared_ptr<ngraph::op::Parameter> dims;
+};
+
+class DynamicShapeResolverNegativeTestsDimsType : public DynamicShapeResolverNegativeTests {};
+TEST_P(DynamicShapeResolverNegativeTestsDimsType, ThrowsOnInvalidDimsType) {
+ ASSERT_THROW(std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims), ngraph::ngraph_error);
+}
+
+std::set<ngraph::element::Type_t> allNGraphNotIntegralTypes() {
+ auto notIntegralTypes = std::set<ngraph::element::Type_t>{};
+ const auto& allTypes = allNGraphTypes();
+ const auto& allIntegralTypes = allNGraphIntegralNumberTypes();
+ std::set_difference(allTypes.cbegin(), allTypes.cend(), allIntegralTypes.cbegin(), allIntegralTypes.cend(),
+ std::inserter(notIntegralTypes, notIntegralTypes.begin()));
+ return notIntegralTypes;
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverNegativeTestsDimsType, testing::Combine(
+ testing::ValuesIn(allNGraphTypes()),
+ testing::ValuesIn(allNGraphNotIntegralTypes()),
+ testing::Values(DataPartialShape{1, 800}),
+ testing::Values(DataPartialShape{2})));
+
+class DynamicShapeResolverNegativeTestsDataShape : public DynamicShapeResolverNegativeTests {};
+// Test name fixed from copy-pasted "ThrowsOnInvalidDimsType": this case
+// feeds invalid *data shapes* (dynamic/partially dynamic), not dims types.
+TEST_P(DynamicShapeResolverNegativeTestsDataShape, ThrowsOnInvalidDataShape) {
+ ASSERT_THROW(std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims), ngraph::ngraph_error);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverNegativeTestsDataShape, testing::Combine(
+ testing::ValuesIn(allNGraphTypes()),
+ testing::ValuesIn(allNGraphIntegralNumberTypes()),
+ testing::Values(
+ DataPartialShape::dynamic(),
+ DataPartialShape{{1, ngraph::Dimension::dynamic()}},
+ DataPartialShape{{ngraph::Dimension::dynamic(), 1}},
+ DataPartialShape{{ngraph::Dimension::dynamic(), ngraph::Dimension::dynamic()}}),
+ testing::Values(DataShape{2})));
+
+class DynamicShapeResolverNegativeTestsDimsShape : public DynamicShapeResolverNegativeTests {};
+// Test name fixed from copy-pasted "ThrowsOnInvalidDimsType": this case
+// feeds invalid *dims shapes* (dynamic or wrong-rank), not dims types.
+TEST_P(DynamicShapeResolverNegativeTestsDimsShape, ThrowsOnInvalidDimsShape) {
+ ASSERT_THROW(std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims), ngraph::ngraph_error);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverNegativeTestsDimsShape, testing::Combine(
+ testing::ValuesIn(allNGraphTypes()),
+ testing::ValuesIn(allNGraphIntegralNumberTypes()),
+ testing::Values(DataShape{1, 800}),
+ testing::Values(
+ DataPartialShape::dynamic(),
+ DataPartialShape{{1, ngraph::Dimension::dynamic()}},
+ DataPartialShape{{ngraph::Dimension::dynamic(), 1}},
+ DataPartialShape{{ngraph::Dimension::dynamic(), ngraph::Dimension::dynamic()}},
+ DataPartialShape{0},
+ DataPartialShape{1},
+ DataPartialShape{3})));
+
+} // namespace
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/static_shape_nonzero.hpp"
+
+#include <common_test_utils/test_common.hpp>
+
+#include <ngraph/op/parameter.hpp>
+#include <ngraph/function.hpp>
+
+#include <details/ie_exception.hpp>
+
+#include <gtest/gtest.h>
+
+namespace {
+
+using TensorType = ngraph::element::Type;
+using TensorShape = ngraph::PartialShape;
+
+class StaticShapeNonZeroTests
+ : public CommonTestUtils::TestsCommon,
+ public testing::WithParamInterface<std::tuple<TensorType, TensorShape>> {
+public:
+ void SetUp() override {
+ const auto& parameters = GetParam();
+ const auto& tensorType = std::get<0>(parameters);
+ const auto& tensorShape = std::get<1>(parameters);
+
+ m_param = std::make_shared<ngraph::op::Parameter>(tensorType, tensorShape);
+ }
+protected:
+ std::shared_ptr<ngraph::op::Parameter> m_param;
+};
+
+std::vector<ngraph::PartialShape> testStaticShapes {
+ TensorShape{1000},
+ TensorShape{4, 1000},
+ TensorShape{3, 128, 256},
+ TensorShape{2, 3, 128, 256},
+};
+
+std::vector<ngraph::PartialShape> testDynamicShapes {
+ TensorShape{ngraph::Dimension::dynamic()},
+ TensorShape{4, ngraph::Dimension::dynamic()},
+ TensorShape{3, ngraph::Dimension::dynamic(), 256},
+};
+
+std::vector<ngraph::element::Type> testNGraphNumericTypes {
+ ngraph::element::dynamic,
+ ngraph::element::bf16,
+ ngraph::element::f16,
+ ngraph::element::f32,
+ ngraph::element::f64,
+ ngraph::element::i8,
+ ngraph::element::i16,
+ ngraph::element::i32,
+ ngraph::element::i64,
+ ngraph::element::u1,
+ ngraph::element::u8,
+ ngraph::element::u16,
+ ngraph::element::u32,
+ ngraph::element::u64,
+};
+
+//
+// Positive tests
+//
+
+TEST_P(StaticShapeNonZeroTests, CanValidateAndInferTypes) {
+ std::shared_ptr<ngraph::op::StaticShapeNonZero> op;
+ ASSERT_NO_THROW(op = std::make_shared<ngraph::op::StaticShapeNonZero>(m_param));
+ ASSERT_NO_THROW(std::make_shared<ngraph::Function>(
+ ngraph::OutputVector{op->output(0), op->output(1)},
+ ngraph::ParameterVector{m_param}));
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, StaticShapeNonZeroTests, testing::Combine(
+ testing::ValuesIn(testNGraphNumericTypes),
+ testing::ValuesIn(testStaticShapes))
+);
+
+//
+// Negative tests
+//
+
+class StaticShapeNonZeroTestsNegativeDataType : public StaticShapeNonZeroTests {};
+TEST_P(StaticShapeNonZeroTestsNegativeDataType, ThrowsOnInvalidDataType) {
+ std::shared_ptr<ngraph::op::StaticShapeNonZero> op;
+ ASSERT_THROW(op = std::make_shared<ngraph::op::StaticShapeNonZero>(m_param),
+ ngraph::NodeValidationFailure);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, StaticShapeNonZeroTestsNegativeDataType, testing::Combine(
+ testing::Values(ngraph::element::boolean),
+ testing::ValuesIn(testStaticShapes))
+);
+
+class StaticShapeNonZeroTestsNegativeDataShape : public StaticShapeNonZeroTests {};
+TEST_P(StaticShapeNonZeroTestsNegativeDataShape, ThrowsOnInvalidDataShape) {
+ std::shared_ptr<ngraph::op::StaticShapeNonZero> op;
+ ASSERT_THROW(op = std::make_shared<ngraph::op::StaticShapeNonZero>(m_param),
+ ngraph::NodeValidationFailure);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, StaticShapeNonZeroTestsNegativeDataShape, testing::Combine(
+ testing::ValuesIn(testNGraphNumericTypes),
+ testing::ValuesIn(testDynamicShapes))
+);
+
+} // namespace
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp"
+#include "vpu/ngraph/operations/static_shape_nonzero.hpp"
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+#include "../utils/ngraph_utils.h"
+
+#include <ngraph/function.hpp>
+#include <ngraph/opsets/opset1.hpp>
+#include <ngraph/opsets/opset3.hpp>
+
+#include <cpp/ie_cnn_network.h>
+
+#include <common_test_utils/test_common.hpp>
+#include <gtest/gtest.h>
+
+#include <string>
+#include <memory>
+#include <map>
+#include <vector>
+
+namespace {
+
+using TensorType = ngraph::element::Type_t;
+using TensorShape = ngraph::Shape;
+
+class DynamicToStaticShapeNonZeroTests
+ : public CommonTestUtils::TestsCommon,
+ public testing::WithParamInterface<std::tuple<TensorType, TensorShape>> {
+public:
+ void prepareFunctions() {
+ const auto& parameters = GetParam();
+ const auto& tensorType = std::get<0>(parameters);
+ const auto& tensorShape = std::get<1>(parameters);
+
+ // Create a function with only opset3::NonZero
+ // And then run conversion pass
+ {
+ const auto input = std::make_shared<ngraph::op::Parameter>(tensorType, tensorShape);
+
+ const auto nonZero = std::make_shared<ngraph::opset3::NonZero>(input);
+ nonZero->set_friendly_name(s_FriendlyName);
+
+ m_resfunction = std::make_shared<ngraph::Function>(
+ ngraph::NodeVector{nonZero}, ngraph::ParameterVector{input});
+ ngraph::pass::DynamicToStaticShapeNonZero().run_on_function(m_resfunction);
+ }
+
+ // Create a reference function
+ {
+ const auto input = std::make_shared<ngraph::opset1::Parameter>(tensorType, tensorShape);
+
+ const auto staticShapeNonZero = std::make_shared<ngraph::op::StaticShapeNonZero>(input);
+ staticShapeNonZero->set_friendly_name(s_FriendlyName + "/static_shape");
+ const auto dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(
+ staticShapeNonZero->output(0), staticShapeNonZero->output(1));
+ dynamicShapeResolver->set_friendly_name(s_FriendlyName + "/resolve_shape");
+
+ m_refFunction = std::make_shared<ngraph::Function>(
+ ngraph::NodeVector{dynamicShapeResolver}, ngraph::ParameterVector{input});
+ }
+ }
+
+ void compareFunctions() {
+ FuncTestUtils::CompareFunctions(m_resfunction, m_refFunction);
+
+ auto actualResultNode = m_resfunction->get_output_op(0);
+ auto actualResolverNode = actualResultNode->input(0).get_source_output().get_node_shared_ptr();
+ auto actualNonZeroNode = actualResolverNode->input(0).get_source_output().get_node_shared_ptr();
+
+ auto expectedResultNode = m_refFunction->get_output_op(0);
+ auto expectedResolverNode = expectedResultNode->input(0).get_source_output().get_node_shared_ptr();
+ auto expectedNonZeroNode = expectedResolverNode->input(0).get_source_output().get_node_shared_ptr();
+
+ EXPECT_EQ(actualResolverNode->get_friendly_name(), expectedResolverNode->get_friendly_name());
+ EXPECT_EQ(actualNonZeroNode->get_friendly_name(), expectedNonZeroNode->get_friendly_name());
+ }
+
+protected:
+ std::shared_ptr<ngraph::Function> m_resfunction;
+ std::shared_ptr<ngraph::Function> m_refFunction;
+
+ static const std::string s_FriendlyName;
+};
+
+const std::string DynamicToStaticShapeNonZeroTests::s_FriendlyName = "non_zero";
+
+TEST_P(DynamicToStaticShapeNonZeroTests, inferAndValidate) {
+ prepareFunctions();
+ compareFunctions();
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicToStaticShapeNonZeroTests, testing::Combine(
+ testing::Values(
+ ngraph::element::f16,
+ ngraph::element::f32,
+ ngraph::element::i32,
+ ngraph::element::i64,
+ ngraph::element::u8),
+ testing::Values(
+ TensorShape{1000},
+ TensorShape{4, 1000},
+ TensorShape{3, 128, 256},
+ TensorShape{2, 3, 128, 256})
+));
+
+} // namespace
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/function.hpp>
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <queue>
+#include <string>
+
+namespace FuncTestUtils {
+
+using ComparingNodesPair = typename std::pair<std::shared_ptr<ngraph::Node>, std::shared_ptr<ngraph::Node>>;
+using ComparingNodesBFSQueue = typename std::queue<ComparingNodesPair>;
+
+//
+// This function compares two nGraph functions and requires them to have exactly one output
+// Check nodes types
+// Check number of inputs
+// Check shapes of each Node
+//
+// 'inline' is required here: this function is *defined* in a header
+// (#pragma once above), so without it every translation unit that
+// includes the header emits its own external definition — an ODR violation.
+inline void CompareFunctions(const std::shared_ptr<ngraph::Function>& fActual,
+ const std::shared_ptr<ngraph::Function>& fExpected) {
+ const auto fActualResults = fActual->get_results();
+ const auto fExpectedResults = fExpected->get_results();
+
+ ASSERT_EQ(fActualResults.size(), 1);
+ ASSERT_EQ(fExpectedResults.size(), 1);
+
+ const auto typeInfoToStr = [](const ngraph::Node::type_info_t& typeInfo) {
+ return std::string(typeInfo.name) + "/" + std::to_string(typeInfo.version);
+ };
+
+ // Breadth-first walk over both graphs in lockstep, starting from the
+ // single Result node of each function.
+ ComparingNodesBFSQueue comparingNodes;
+ comparingNodes.push({fActualResults[0], fExpectedResults[0]});
+ while (!comparingNodes.empty()) {
+ const auto node1 = comparingNodes.front().first;
+ const auto node2 = comparingNodes.front().second;
+ comparingNodes.pop();
+
+ ASSERT_EQ(node1->get_type_info(), node2->get_type_info())
+ << "Functions compare: data types must be equal "
+ << typeInfoToStr(node1->get_type_info()) << " != "
+ << typeInfoToStr(node2->get_type_info());
+
+ ASSERT_EQ(node1->inputs().size(), node2->inputs().size())
+ << "Functions compare: numbers of inputs are different: "
+ << node1->inputs().size() << " and " << node2->inputs().size();
+
+ // size_t index avoids a signed/unsigned comparison with inputs().size()
+ for (size_t i = 0; i < node1->inputs().size(); ++i) {
+ const auto partialShape1 = node1->input(i).get_partial_shape();
+ const auto partialShape2 = node2->input(i).get_partial_shape();
+ ASSERT_TRUE(partialShape1.relaxes(partialShape2) && partialShape1.refines(partialShape2))
+ << "Functions compare: Different shape detected "
+ << partialShape1 << " and " << partialShape2;
+
+ comparingNodes.push({node1->input_value(i).get_node_shared_ptr(),
+ node2->input_value(i).get_node_shared_ptr()});
+ }
+ }
+}
+
+} // namespace FuncTestUtils
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/nonzero.hpp"
+
+#include "common_test_utils/test_constants.hpp"
+#include <vpu/vpu_plugin_config.hpp>
+#include <vpu/private_plugin_config.hpp>
+
+#include <vector>
+
+using namespace ngraph::helpers;
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+std::vector<std::vector<size_t>> inShapes = {
+ {1000},
+ {4, 1000},
+ {2, 4, 1000},
+};
+
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+ InferenceEngine::Precision::I32,
+ InferenceEngine::Precision::FP16,
+ InferenceEngine::Precision::U8,
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+ InferenceEngine::Precision::FP16
+};
+
+// TODO: Enable this test when ticket #29056 is ready
+INSTANTIATE_TEST_CASE_P(DISABLED_nonzero, NonZeroLayerTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(inShapes),
+ ::testing::ValuesIn(inputPrecisions),
+ ::testing::ValuesIn(netPrecisions),
+ ::testing::Values(CommonTestUtils::DEVICE_MYRIAD),
+ ::testing::Values(ConfigMap({{VPU_CONFIG_KEY(DETECT_NETWORK_BATCH), CONFIG_VALUE(NO)}}))),
+ NonZeroLayerTest::getTestCaseName);
+} // namespace
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
protected:
void SetUp() override;
+ void TearDown() override;
};
} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
InferenceEngine::Precision,
InferenceEngine::Precision,
InferenceEngine::SizeVector,
- std::string> convLayerTestParamsSet;
+ LayerTestsUtils::TargetDevice> convLayerTestParamsSet;
namespace LayerTestsDefinitions {
-class ConvolutionLayerTest
- : public LayerTestsUtils::LayerTestsCommonClass<convLayerTestParamsSet> {
+class ConvolutionLayerTest : public testing::WithParamInterface<convLayerTestParamsSet>, public LayerTestsUtils::FuncTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<convLayerTestParamsSet> obj);
void SetUp() override;
};
-} // namespace LayerTestsDefinitions
\ No newline at end of file
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace LayerTestsDefinitions {
+
+using ConfigMap = typename std::map<std::string, std::string>;
+
+using NonZeroLayerTestParamsSet = typename std::tuple<
+ InferenceEngine::SizeVector, // Input shapes
+ InferenceEngine::Precision, // Input precision
+ InferenceEngine::Precision, // Network precision
+ std::string, // Device name
+ ConfigMap>; // Config map
+
+class NonZeroLayerTest
+ : public LayerTestsUtils::LayerTestsCommonClass<NonZeroLayerTestParamsSet> {
+public:
+ static std::string getTestCaseName(testing::TestParamInfo<NonZeroLayerTestParamsSet> obj);
+
+protected:
+ void SetUp() override;
+};
+
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+namespace LayerTestsDefinitions {
+ typedef std::tuple<
+ bool, // SpecialZero
+ InferenceEngine::Precision, // Input precision
+ InferenceEngine::Precision, // Network precision
+ std::vector<size_t>, // Input shapes
+ std::vector<size_t>, // OutForm Shapes
+ std::string, // Device name
+ std::map<std::string, std::string> // Config
+ > reshapeParams;
+
+class ReshapeLayerTest
+ : public LayerTestsUtils::LayerTestsCommonClass<reshapeParams> {
+public:
+ static std::string getTestCaseName(testing::TestParamInfo<reshapeParams> obj);
+protected:
+ void SetUp() override;
+};
+
+} // namespace LayerTestsDefinitions
\ No newline at end of file
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+namespace LayerTestsDefinitions {
+
+using softMaxLayerTestParams =
+ std::tuple<
+ InferenceEngine::Precision, // netPrecision
+ InferenceEngine::Precision, // inputPrecision
+ InferenceEngine::Layout, // inputLayout
+ InferenceEngine::SizeVector, // inputShape
+ size_t, // axis
+ std::string, // targetDevice
+ std::map<std::string, std::string> // config
+ >;
+
+class SoftMaxLayerTest :
+ public LayerTestsUtils::LayerTestsCommonClass<softMaxLayerTestParams> {
+public:
+ static std::string getTestCaseName(testing::TestParamInfo<softMaxLayerTestParams> obj);
+
+protected:
+ void SetUp() override;
+};
+
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+namespace LayerTestsDefinitions {
+
+using stridedSliceParamsTuple = typename std::tuple<
+ InferenceEngine::SizeVector, // Input shape
+ std::vector<int64_t>, // Begin
+ std::vector<int64_t>, // End
+ std::vector<int64_t>, // Stride
+ std::vector<int64_t>, // Begin mask
+ std::vector<int64_t>, // End mask
+ std::vector<int64_t>, // New axis mask
+ std::vector<int64_t>, // Shrink axis mask
+ std::vector<int64_t>, // Ellipsis axis mask
+ InferenceEngine::Precision, // Input precision
+ InferenceEngine::Precision, // Network precision
+ std::string>; // Device name
+
+class StridedSliceLayerTest : public LayerTestsUtils::LayerTestsCommonClass<stridedSliceParamsTuple> {
+public:
+ static std::string getTestCaseName(const testing::TestParamInfo<stridedSliceParamsTuple> &obj);
+
+protected:
+ void SetUp() override;
+};
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
fnPtr = std::make_shared<ngraph::Function>(results, params, "SplitConvConcat");
}
+void ExecGraphUniqueNodeNames::TearDown() {
+ if (targetDevice.find(CommonTestUtils::DEVICE_GPU) != std::string::npos) {
+ PluginCache::get().reset();
+ }
+}
+
TEST_P(ExecGraphUniqueNodeNames, CheckUniqueNodeNames) {
InferenceEngine::CNNNetwork cnnNet(fnPtr);
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
// Parameter->Activation->Output
ASSERT_EQ(net.layerCount(), 3);
} else if (device == "GPU") {
- // Parameter--->ScaleShift-------------->Eltwise-->Result
+ // Parameter--->ScaleShift-------------->Eltwise
// `-->ScaleShift->ScaleShift-`
- ASSERT_EQ(net.layerCount(), 6);
+ ASSERT_EQ(net.layerCount(), 5);
}
}
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
outElementsCount,
thr);
fnPtr.reset();
+ if (targetDevice.find(CommonTestUtils::DEVICE_GPU) != std::string::npos) {
+ PluginCache::get().reset();
+ }
}
} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
void ConvolutionLayerTest::SetUp() {
convSpecificParams convParams;
std::vector<size_t> inputShape;
+ auto inputPrecision = InferenceEngine::Precision::UNSPECIFIED;
+ auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
std::tie(convParams, inputPrecision, netPrecision, inputShape, targetDevice) = this->GetParam();
ngraph::op::PadType padType;
InferenceEngine::SizeVector kernel, stride, dilation;
ngraph::builder::makeConvolution(paramOuts[0], ngPrc, kernel, stride, padBegin,
padEnd, dilation, padType, convOutChannels));
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(conv)};
- fnPtr = std::make_shared<ngraph::Function>(results, params, "convolution");
+ function = std::make_shared<ngraph::Function>(results, params, "convolution");
}
TEST_P(ConvolutionLayerTest, CompareWithRefs) {
- inferAndValidate();
+ Run();
+
+ if (targetDevice == std::string{CommonTestUtils::DEVICE_GPU}) {
+ PluginCache::get().reset();
+ }
}
-} // namespace LayerTestsDefinitions
\ No newline at end of file
+} // namespace LayerTestsDefinitions
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/nonzero.hpp"
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+
+#include "ie_core.hpp"
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace LayerTestsDefinitions {
+
+std::string NonZeroLayerTest::getTestCaseName(testing::TestParamInfo<NonZeroLayerTestParamsSet> obj) {
+ std::vector<size_t> inputShape;
+ InferenceEngine::Precision inputPrecision, netPrecision;
+ std::string targetDevice;
+ ConfigMap config;
+ std::tie(inputShape, inputPrecision, netPrecision, targetDevice, config) = obj.param;
+
+ std::ostringstream result;
+ result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
+ result << "inPRC=" << inputPrecision.name() << "_";
+ result << "netPRC=" << netPrecision.name() << "_";
+ result << "targetDevice=" << targetDevice;
+ return result.str();
+}
+
+void NonZeroLayerTest::SetUp() {
+ std::vector<size_t> inputShape;
+ std::tie(inputShape, inputPrecision, netPrecision, targetDevice, config) = this->GetParam();
+
+ auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+ auto paramNode = std::make_shared<ngraph::opset1::Parameter>(ngPrc, ngraph::Shape(inputShape));
+
+ auto nonZeroOp = std::make_shared<ngraph::opset3::NonZero>(paramNode->output(0));
+
+ ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(nonZeroOp)};
+ fnPtr = std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{paramNode}, "non_zero");
+}
+
+TEST_P(NonZeroLayerTest, CompareWithRefs) {
+ inferAndValidate();
+}
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
+//
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+#include <ie_plugin_config.hpp>
+#include <ie_core.hpp>
+#include <functional>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+#include "single_layer_tests/reshape.hpp"
+
+namespace LayerTestsDefinitions {
+ std::string ReshapeLayerTest::getTestCaseName(testing::TestParamInfo<reshapeParams> obj) {
+ InferenceEngine::Precision inputPrecision, netPrecision;
+ InferenceEngine::SizeVector inputShapes, outFormShapes;
+ std::string targetDevice;
+ std::map<std::string, std::string> config;
+ bool specialZero;
+ std::tie(specialZero, inputPrecision, netPrecision, inputShapes, outFormShapes,
+ targetDevice, config) = obj.param;
+ std::ostringstream result;
+ result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+ result << "specialZero=" << specialZero << "_";
+ result << "inPRC=" << inputPrecision.name() << "_";
+ result << "netPRC=" << netPrecision.name() << "_";
+ result << "targetDevice=" << targetDevice;
+ return result.str();
+}
+
+void ReshapeLayerTest::SetUp() {
+ InferenceEngine::SizeVector inputShapes, outFormShapes;
+ bool specialZero;
+ std::tie(specialZero, inputPrecision, netPrecision, inputShapes, outFormShapes,
+ targetDevice, config) = this->GetParam();
+ auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+ auto paramsIn = ngraph::builder::makeParams(ngPrc, {inputShapes});
+ auto paramIn = ngraph::helpers::convert2OutputVector(
+ ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(paramsIn));
+ auto constNode = std::make_shared<ngraph::opset1::Constant>(
+ ngraph::element::Type_t::i64, ngraph::Shape{outFormShapes.size()}, outFormShapes);
+ auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(
+ std::make_shared<ngraph::opset1::Reshape>(paramIn[0], constNode, specialZero));
+ ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(reshape)};
+ fnPtr = std::make_shared<ngraph::Function>(results, paramsIn, "Reshape");
+}
+
+TEST_P(ReshapeLayerTest, CompareWithRefsDynamicBath) {
+ inferAndValidate();
+}
+} // namespace LayerTestsDefinitions
\ No newline at end of file
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/softmax.hpp"
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+
+#include "ie_core.hpp"
+
+#include "ngraph/op/softmax.hpp"
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace LayerTestsDefinitions {
+
+std::string SoftMaxLayerTest::getTestCaseName(testing::TestParamInfo<softMaxLayerTestParams> obj) {
+ InferenceEngine::Precision netPrecision, inputPrecision;
+ InferenceEngine::Layout inputLayout;
+ InferenceEngine::SizeVector inputShape;
+ size_t axis;
+ std::string targetDevice;
+ std::map<std::string, std::string> config;
+ std::tie(netPrecision, inputPrecision, inputLayout, inputShape, axis, targetDevice, config) = obj.param;
+
+ std::ostringstream result;
+ result << "netPRC=" << netPrecision.name() << "_";
+ result << "inPRC=" << inputPrecision.name() << "_";
+ result << "inLayout=" << inputLayout << "_";
+ result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
+ result << "axis=" << axis << "_";
+ result << "targetDevice=" << targetDevice;
+
+ return result.str();
+}
+
+void SoftMaxLayerTest::SetUp() {
+ InferenceEngine::SizeVector inputShape;
+ size_t axis;
+ std::tie(netPrecision, inputPrecision, inputLayout, inputShape, axis, targetDevice, config) = GetParam();
+ outputPrecision = inputPrecision;
+ outputLayout = inputLayout;
+
+ const auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+ const auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+
+ const auto paramOuts =
+ ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+
+ const auto softMax = std::make_shared<ngraph::opset1::Softmax>(paramOuts.at(0), axis);
+
+ const ngraph::ResultVector results {std::make_shared<ngraph::opset1::Result>(softMax)};
+
+ fnPtr = std::make_shared<ngraph::Function>(results, params, "softMax");
+}
+
+TEST_P(SoftMaxLayerTest, CompareWithRefs) {
+ inferAndValidate();
+}
+
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
TEST_P(SpaceToBatchLayerTest, CompareWithRefs) {
inferAndValidate();
-};
+}
} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <ie_core.hpp>
+#include <ngraph_functions/builders.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "functional_test_utils/precision_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "single_layer_tests/strided_slice.hpp"
+
+namespace LayerTestsDefinitions {
+
+std::string StridedSliceLayerTest::getTestCaseName(const testing::TestParamInfo<stridedSliceParamsTuple> &obj) {
+ InferenceEngine::SizeVector inputShape;
+ std::vector<int64_t> begin, end, stride;
+ std::vector<int64_t> begin_mask, new_axis_mask, end_mask, shrink_mask, ellipsis_mask;
+ InferenceEngine::Precision inPrc, netPrc;
+ std::string targetName;
+ std::tie(inputShape, begin, end, stride, begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask, inPrc, netPrc, targetName) = obj.param;
+ std::ostringstream result;
+ result << "inShape=" << CommonTestUtils::vec2str(inputShape) << "_";
+ result << "inPRC=" << inPrc.name() << "_";
+ result << "netPRC=" << netPrc.name() << "_";
+ result << "begin=" << CommonTestUtils::vec2str(begin) << "_";
+ result << "end=" << CommonTestUtils::vec2str(end) << "_";
+ result << "stride=" << CommonTestUtils::vec2str(stride) << "_";
+ result << "begin_m=" << CommonTestUtils::vec2str(begin_mask) << "_";
+ result << "end_m=" << CommonTestUtils::vec2str(end_mask) << "_";
+ result << "new_axis_m=" << CommonTestUtils::vec2str(new_axis_mask) << "_";
+ result << "shrink_m=" << CommonTestUtils::vec2str(shrink_mask) << "_";
+ result << "ellipsis_m=" << CommonTestUtils::vec2str(ellipsis_mask) << "_";
+ result << "targetDevice=" << targetName << "_";
+ return result.str();
+}
+
+void StridedSliceLayerTest::SetUp() {
+ InferenceEngine::SizeVector inputShape;
+ std::vector<int64_t> begin, end, stride;
+ std::vector<int64_t> begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask;
+ std::tie(inputShape, begin, end, stride, begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask,
+ inputPrecision, netPrecision, targetDevice) = this->GetParam();
+
+ auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+ auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+ auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+ auto ss = ngraph::builder::makeStridedSlice(paramOuts[0], begin, end, stride, ngPrc, begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask);
+ ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(ss)};
+ fnPtr = std::make_shared<ngraph::Function>(results, params, "StridedSlice");
+}
+
+TEST_P(StridedSliceLayerTest, CompareWithRefs) {
+ inferAndValidate();
+}
+
+} // namespace LayerTestsDefinitions
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#pragma once
#include <map>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#pragma once
#include <algorithm>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#pragma once
#include <cmath>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#pragma once
#include <fstream>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#pragma once
namespace CommonTestUtils {
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
set(TARGET_NAME funcTestUtils)
-list(APPEND EXPORT_DEPENDENCIES
- commonTestUtils
- inference_engine
- )
+list(APPEND EXPORT_DEPENDENCIES commonTestUtils inference_engine)
addIeTarget(
- NAME ${TARGET_NAME}
- TYPE STATIC
- ROOT ${CMAKE_CURRENT_SOURCE_DIR}
- ADD_CPPLINT
- DEVELOPER_PACKAGE
- EXPORT_DEPENDENCIES
- ${EXPORT_DEPENDENCIES}
+ NAME ${TARGET_NAME}
+ TYPE STATIC
+ ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+ ADD_CPPLINT
+ DEVELOPER_PACKAGE
+ LINK_LIBRARIES ngraphFunctions
+ EXPORT_DEPENDENCIES ${EXPORT_DEPENDENCIES}
)
-target_include_directories(${TARGET_NAME} PUBLIC
- $<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
+target_include_directories(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
-target_link_libraries(${TARGET_NAME}
- PUBLIC
- ${EXPORT_DEPENDENCIES}
- )
\ No newline at end of file
+target_link_libraries(${TARGET_NAME} PUBLIC ${EXPORT_DEPENDENCIES})
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#pragma once
#include <gtest/gtest.h>
#include "blob_factory.hpp"
+#include "blob_transform.hpp"
#include "precision_utils.h"
#include "common_test_utils/data_utils.hpp"
#include "common_test_utils/test_constants.hpp"
blob->allocate();
switch (td.getPrecision()) {
#define CASE(X) case X: CommonTestUtils::fill_data_random<X>(blob, range, start_from, resolution); break;
- CASE(InferenceEngine::Precision::FP32);
- CASE(InferenceEngine::Precision::FP16);
- CASE(InferenceEngine::Precision::U8);
- CASE(InferenceEngine::Precision::U16);
- CASE(InferenceEngine::Precision::I8);
- CASE(InferenceEngine::Precision::I16);
- CASE(InferenceEngine::Precision::I64);
- CASE(InferenceEngine::Precision::BIN);
+ CASE(InferenceEngine::Precision::FP32)
+ CASE(InferenceEngine::Precision::FP16)
+ CASE(InferenceEngine::Precision::U8)
+ CASE(InferenceEngine::Precision::U16)
+ CASE(InferenceEngine::Precision::I8)
+ CASE(InferenceEngine::Precision::I16)
+ CASE(InferenceEngine::Precision::I64)
+ CASE(InferenceEngine::Precision::BIN)
+ CASE(InferenceEngine::Precision::I32)
#undef CASE
default:
THROW_IE_EXCEPTION << "Wrong precision specified: " << td.getPrecision().name();
}
return blob;
}
-} // namespace FuncTestUtils
\ No newline at end of file
+
+InferenceEngine::Blob::Ptr inline convertBlobLayout(const InferenceEngine::Blob::Ptr& in,
+ InferenceEngine::Layout layout) {
+ IE_ASSERT(in != nullptr) << "Got NULL pointer";
+
+ const auto& inDesc = in->getTensorDesc();
+
+ if (inDesc.getLayout() == layout) {
+ return in;
+ }
+
+ const auto outDesc = InferenceEngine::TensorDesc(inDesc.getPrecision(), inDesc.getDims(), layout);
+
+ const auto out = make_blob_with_precision(outDesc);
+ out->allocate();
+
+ InferenceEngine::blob_copy(in, out);
+
+ return out;
+}
+
+} // namespace FuncTestUtils
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "layer_test_utils.hpp"
+
+namespace LayerTestsUtils {
+
+FuncTestsCommon::FuncTestsCommon() {
+ core = PluginCache::get().ie(targetDevice).get();
+}
+
+void FuncTestsCommon::Run() {
+ SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+ Configure();
+ LoadNetwork();
+ Infer();
+ Validate();
+}
+
+FuncTestsCommon::~FuncTestsCommon() {
+ if (!configuration.empty()) {
+ PluginCache::get().reset();
+ }
+}
+
+InferenceEngine::Blob::Ptr FuncTestsCommon::GenerateInput(const InferenceEngine::InputInfo& info) const {
+ return FuncTestUtils::createAndFillBlob(info.getTensorDesc());
+}
+
+void FuncTestsCommon::Compare(const std::vector<std::uint8_t>& expected, const InferenceEngine::Blob::Ptr& actual) {
+ ASSERT_EQ(expected.size(), actual->byteSize());
+ const auto& expectedBuffer = expected.data();
+
+ auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(actual);
+ IE_ASSERT(memory);
+ const auto lockedMemory = memory->wmap();
+ const auto actualBuffer = lockedMemory.as<const std::uint8_t*>();
+
+ const auto& precision = actual->getTensorDesc().getPrecision();
+ const auto& size = actual->size();
+ switch (precision) {
+ case InferenceEngine::Precision::FP32:
+ Compare(reinterpret_cast<const float*>(expectedBuffer), reinterpret_cast<const float*>(actualBuffer), size, 1e-2f);
+ break;
+ case InferenceEngine::Precision::I32:
+ Compare(reinterpret_cast<const std::int32_t*>(expectedBuffer), reinterpret_cast<const std::int32_t*>(actualBuffer), size, 0);
+ break;
+ default:
+ FAIL() << "Comparator for " << precision << " precision isn't supported";
+ }
+}
+
+void FuncTestsCommon::Configure() const {
+ if (!configuration.empty()) {
+ core->SetConfig(configuration, targetDevice);
+ }
+}
+
+void FuncTestsCommon::LoadNetwork() {
+ cnnNetwork = InferenceEngine::CNNNetwork{function};
+ executableNetwork = core->LoadNetwork(cnnNetwork, targetDevice);
+ inferRequest = executableNetwork.CreateInferRequest();
+
+ for (const auto& input : cnnNetwork.getInputsInfo()) {
+ const auto& info = input.second;
+
+ auto blob = GenerateInput(*info);
+ inferRequest.SetBlob(info->name(), blob);
+ inputs.push_back(blob);
+ }
+}
+
+void FuncTestsCommon::Infer() {
+ inferRequest.Infer();
+}
+
+std::vector<InferenceEngine::Blob::Ptr> FuncTestsCommon::GetOutputs() {
+ auto outputs = std::vector<InferenceEngine::Blob::Ptr>{};
+ for (const auto& output : cnnNetwork.getOutputsInfo()) {
+ const auto& name = output.first;
+ outputs.push_back(inferRequest.GetBlob(name));
+ }
+ return outputs;
+}
+
+void FuncTestsCommon::Validate() {
+ // nGraph interpreter does not support f16
+ // IE converts f16 to f32
+ ngraph::pass::ConvertPrecision<ngraph::element::Type_t::f16, ngraph::element::Type_t::f32>().run_on_function(function);
+ function->validate_nodes_and_infer_types();
+
+ auto referenceInputs = std::vector<std::vector<std::uint8_t>>(inputs.size());
+ for (std::size_t i = 0; i < inputs.size(); ++i) {
+ const auto& input = inputs[i];
+ const auto& inputSize = input->byteSize();
+
+ auto& referenceInput = referenceInputs[i];
+ referenceInput.resize(inputSize);
+
+ auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(input);
+ IE_ASSERT(memory);
+ const auto lockedMemory = memory->wmap();
+ const auto buffer = lockedMemory.as<const std::uint8_t*>();
+ std::copy(buffer, buffer + inputSize, referenceInput.data());
+ }
+
+ const auto& expectedOutputs = ngraph::helpers::interpreterFunction(function, referenceInputs);
+ const auto& actualOutputs = GetOutputs();
+ IE_ASSERT(actualOutputs.size() == expectedOutputs.size())
+ << "nGraph interpreter has " << expectedOutputs.size() << " outputs, while IE " << actualOutputs.size();
+
+ for (std::size_t outputIndex = 0; outputIndex < expectedOutputs.size(); ++outputIndex) {
+ const auto& expected = expectedOutputs[outputIndex];
+ const auto& actual = actualOutputs[outputIndex];
+ Compare(expected, actual);
+ }
+}
+
+} // namespace LayerTestsUtils
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <ngraph/node.hpp>
#include <ngraph/function.hpp>
-
+#include <ie_plugin_config.hpp>
#include <ngraph/function.hpp>
#include "common_test_utils/common_utils.hpp"
template<typename paramType>
class LayerTestsCommonClass : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<paramType> {
public:
- InferenceEngine::Precision netPrecision, inputPrecision;
+ InferenceEngine::Precision netPrecision;
+ InferenceEngine::Precision inputPrecision;
+ InferenceEngine::Precision outputPrecision;
+ InferenceEngine::Layout inputLayout;
+ InferenceEngine::Layout outputLayout;
std::string targetDevice;
std::shared_ptr<ngraph::Function> fnPtr;
+ std::map<std::string, std::string> config;
+
+ LayerTestsCommonClass() {
+ netPrecision = InferenceEngine::Precision::UNSPECIFIED;
+ inputPrecision = InferenceEngine::Precision::UNSPECIFIED;
+ outputPrecision = InferenceEngine::Precision::UNSPECIFIED;
+ inputLayout = InferenceEngine::Layout::ANY;
+ outputLayout = InferenceEngine::Layout::ANY;
+ }
void inline inferAndValidate() {
// Skip test according to plugin specific disabledTestPatterns() (if any)
SKIP_IF_CURRENT_TEST_IS_DISABLED()
// Create CNNNetwork from ngraph::Function
InferenceEngine::CNNNetwork cnnNet(fnPtr);
- // Set target input Precisions for the network
- setNetInOutPrecision(cnnNet, inputPrecision);
+ // Set target input/output Precisions for the network
+ setNetInOutPrecision(cnnNet, inputPrecision, outputPrecision);
+ // Set target input Layouts for the network
+ setNetInOutLayout(cnnNet, inputLayout, outputLayout);
// Get Core from cache
auto ie = PluginCache::get().ie();
+ // Load config
+ if (!config.empty()) {
+ ie->SetConfig(config, targetDevice);
+ }
// Load CNNNetwork to target plugins
auto execNet = ie->LoadNetwork(cnnNet, targetDevice);
// Create InferRequest
// Create input vector with raw data for reference calculation
std::vector<const float *> inRawData;
// References are calculated in float precision, so blobs have to be copied and casted if required
- std::vector<InferenceEngine::Blob::Ptr> castedBlobs = inBlobs;
- for (size_t i = 0; i < castedBlobs.size(); i++) {
- if (inputPrecision != InferenceEngine::Precision::FP32) {
- castedBlobs[i] = FuncTestUtils::copyBlobWithCast<InferenceEngine::Precision::FP32>(inBlobs[i]);
+ std::vector<InferenceEngine::Blob::Ptr> castedBlobs;
+ for (size_t i = 0; i < inBlobs.size(); i++) {
+ const auto precision = inBlobs[i]->getTensorDesc().getPrecision();
+ const auto layout = inBlobs[i]->getTensorDesc().getLayout();
+ const auto defLayout = InferenceEngine::TensorDesc::getLayoutByDims(inBlobs[i]->getTensorDesc().getDims());
+
+ if (precision == InferenceEngine::Precision::FP32 && layout == defLayout) {
+ inRawData.push_back(inBlobs[i]->cbuffer().template as<const float*>());
+ } else {
+ auto castedBlob = FuncTestUtils::copyBlobWithCast<InferenceEngine::Precision::FP32>(inBlobs[i]);
+ castedBlob = FuncTestUtils::convertBlobLayout(castedBlob, defLayout);
+ inRawData.push_back(castedBlob->cbuffer().template as<const float*>());
+ castedBlobs.push_back(castedBlob);
}
- inRawData.push_back(castedBlobs[i]->cbuffer().as<float *>());
}
// Run inference in IE
req.Infer();
-
+ // Reset PluginCache
+ if (!config.empty()) {
+ PluginCache::get().reset();
+ }
// Get output raw data from resulting output blobs
std::vector<float *> outBlobsRawData;
std::vector<size_t> outElementsCount; // output elements count required for compareRawBuffers()
for (const auto &output : cnnNet.getOutputsInfo()) {
auto currentBlob = req.GetBlob(output.first);
- outBlobsRawData.push_back(currentBlob->cbuffer().template as<float *>());
+
outElementsCount.push_back(
- std::accumulate(begin(output.second->getDims()), end(output.second->getDims()), 1,
- std::multiplies<float>()));
+ std::accumulate(
+ std::begin(output.second->getDims()), std::end(output.second->getDims()),
+ size_t {1}, std::multiplies<size_t>()));
+
+ const auto precision = currentBlob->getTensorDesc().getPrecision();
+ const auto layout = currentBlob->getTensorDesc().getLayout();
+ const auto defLayout = InferenceEngine::TensorDesc::getLayoutByDims(currentBlob->getTensorDesc().getDims());
+
+ if (precision == InferenceEngine::Precision::FP32 && layout == defLayout) {
+ outBlobsRawData.push_back(currentBlob->cbuffer().template as<float*>());
+ } else {
+ auto castedBlob = FuncTestUtils::copyBlobWithCast<InferenceEngine::Precision::FP32>(currentBlob);
+ castedBlob = FuncTestUtils::convertBlobLayout(castedBlob, defLayout);
+ outBlobsRawData.push_back(castedBlob->cbuffer().template as<float*>());
+ castedBlobs.push_back(castedBlob);
+ }
}
// Convert initial ngraph::Function to fp32 for references calculation
- convertFuncToF32(fnPtr, netPrecision);;
+ convertFuncToF32(fnPtr, netPrecision);
// Run ngraph Interpreter backend to calculate references
auto refOutData = ngraph::helpers::inferFnWithInterp<ngraph::element::Type_t::f32>(fnPtr, inRawData);
// Compare IE infer results vs ngraph Interpreter reference results
// Deallocate ngraph::Function pointer
fnPtr.reset();
+ if (targetDevice.find(CommonTestUtils::DEVICE_GPU) != std::string::npos) {
+ PluginCache::get().reset();
+ }
}
protected:
- void setNetInOutPrecision(InferenceEngine::CNNNetwork &cnnNet, InferenceEngine::Precision inPrc,
+ static void setNetInOutPrecision(InferenceEngine::CNNNetwork &cnnNet, InferenceEngine::Precision inPrc,
InferenceEngine::Precision outPrc = InferenceEngine::Precision::UNSPECIFIED) {
- for (const auto &inputItem : cnnNet.getInputsInfo()) {
- inputItem.second->setPrecision(inPrc);
+ if (inPrc != InferenceEngine::Precision::UNSPECIFIED) {
+ for (const auto &inputItem : cnnNet.getInputsInfo()) {
+ inputItem.second->setPrecision(inPrc);
+ }
}
if (outPrc != InferenceEngine::Precision::UNSPECIFIED) {
for (const auto &output : cnnNet.getOutputsInfo()) {
}
}
+ static void setNetInOutLayout(InferenceEngine::CNNNetwork& cnnNet, InferenceEngine::Layout inputLayout,
+ InferenceEngine::Layout outputLayout = InferenceEngine::Layout::ANY) {
+ if (inputLayout != InferenceEngine::Layout::ANY) {
+ for (const auto& inputItem : cnnNet.getInputsInfo()) {
+ inputItem.second->setLayout(inputLayout);
+ }
+ }
+ if (outputLayout != InferenceEngine::Layout::ANY) {
+ for (const auto& output : cnnNet.getOutputsInfo()) {
+ output.second->setLayout(outputLayout);
+ }
+ }
+ }
+
void convertFuncToF32(std::shared_ptr<ngraph::Function> fn, InferenceEngine::Precision prc) {
switch (prc) {
case InferenceEngine::Precision::FP32:
return nodes;
}
+using TargetDevice = std::string;
+
+class FuncTestsCommon : public CommonTestUtils::TestsCommon {
+public:
+ virtual InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const;
+ virtual void Run();
+ virtual void Compare(const std::vector<std::uint8_t>& expected, const InferenceEngine::Blob::Ptr& actual);
+
+protected:
+ FuncTestsCommon();
+ ~FuncTestsCommon() override;
+
+ template<class T>
+ void Compare(const T* expected, const T* actual, std::size_t size, T threshold) {
+ for (std::size_t i = 0; i < size; ++i) {
+ const auto& ref = expected[i];
+ const auto& res = actual[i];
+
+ const auto absoluteDifference = std::abs(res - ref);
+ if (absoluteDifference <= threshold) {
+ continue;
+ }
+
+ const auto max = std::max(std::abs(res), std::abs(ref));
+ ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold))
+ << "Relative comparison of values expected: " << ref << " and actual: " << res << " at index " << i << " with threshold " << threshold
+ << " failed";
+ }
+ }
+
+ TargetDevice targetDevice;
+ std::shared_ptr<ngraph::Function> function;
+ std::map<std::string, std::string> configuration;
+
+private:
+ void Configure() const;
+ void LoadNetwork();
+ void Infer();
+ std::vector<InferenceEngine::Blob::Ptr> GetOutputs();
+ void Validate();
+
+ InferenceEngine::Core* core = nullptr;
+ InferenceEngine::CNNNetwork cnnNetwork;
+ InferenceEngine::ExecutableNetwork executableNetwork;
+ InferenceEngine::InferRequest inferRequest;
+ std::vector<InferenceEngine::Blob::Ptr> inputs;
+};
+
} // namespace LayerTestsUtils
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
inference_engine_s
inference_engine_preproc_s
inference_engine_lp_transformations
+ inference_engine_ir_readers
gmock)
addIeTarget(
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
inputs[MockNotEmptyICNNNetwork::INPUT_BLOB_NAME] = inputInfo;
};
void addLayer(const CNNLayerPtr& layer) noexcept override {}
+ std::shared_ptr<ngraph::Function> getFunction() noexcept override {
+ return nullptr;
+ }
std::shared_ptr<const ngraph::Function> getFunction() const noexcept override {
return nullptr;
}
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset2.hpp>
+#include <ngraph/opsets/opset3.hpp>
#include "ngraph_functions/utils/data_utils.hpp"
const std::vector<size_t> &blockShape,
const std::vector<size_t> &padsBegin,
const std::vector<size_t> &padsEnd);
+
+std::shared_ptr<ngraph::Node> makeStridedSlice(const ngraph::Output<Node> &in,
+ const std::vector<int64_t> &begin,
+ const std::vector<int64_t> &end,
+ const std::vector<int64_t> &stride,
+ const element::Type &type,
+ const std::vector<int64_t> &begin_mask,
+ const std::vector<int64_t> &end_mask,
+ const std::vector<int64_t> &new_axis_mask = std::vector<int64_t>{},
+ const std::vector<int64_t> &shrink_mask = std::vector<int64_t>{},
+ const std::vector<int64_t> &ellipsis_mask = std::vector<int64_t>{});
} // namespace builder
-} // namespace ngraph
\ No newline at end of file
+} // namespace ngraph
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+//*****************************************************************************
+// Copyright 2017-2020 Intel Corporation
//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
#pragma once
--- /dev/null
+// Cngraph::opyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_precision.hpp>
+#include <functional_test_utils/precision_utils.hpp>
+#include "ngraph_functions/builders.hpp"
+
+namespace ngraph {
+namespace builder {
+namespace subgraph {
+static std::shared_ptr<ngraph::Function> makeSplitConvConcat(std::vector<size_t> inputShape = {1, 4, 20, 20},
+ InferenceEngine::Precision netPrecision = InferenceEngine::Precision::FP32) {
+ auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+ auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+ auto split = ngraph::builder::makeSplit(params[0], ngPrc, 2, 1);
+
+ auto conv1 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu1 = std::make_shared<ngraph::opset1::Relu>(conv1);
+
+ auto conv2 = ngraph::builder::makeConvolution(split->output(1), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu2 = std::make_shared<ngraph::opset1::Relu>(conv2);
+
+ auto concat = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{relu1->output(0), relu2->output(0)}, 1);
+ ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(concat)};
+ std::shared_ptr<ngraph::Function> fnPtr = std::make_shared<ngraph::Function>(results, params);
+ return fnPtr;
+}
+
+static std::shared_ptr<ngraph::Function> makeSplitMultiConvConcat(std::vector<size_t> inputShape = {1, 4, 20, 20}) {
+ auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(InferenceEngine::Precision::FP32);
+ auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+ auto split = ngraph::builder::makeSplit(params[0], ngPrc, 2, 1);
+
+ auto conv1_0 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu1_0 = std::make_shared<ngraph::opset1::Relu>(conv1_0);
+ auto conv1_1 = ngraph::builder::makeConvolution(relu1_0, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu1_1 = std::make_shared<ngraph::opset1::Relu>(conv1_1);
+ auto conv1_2 = ngraph::builder::makeConvolution(relu1_1, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu1_2 = std::make_shared<ngraph::opset1::Relu>(conv1_2);
+ auto conv1_3 = ngraph::builder::makeConvolution(relu1_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu1_3 = std::make_shared<ngraph::opset1::Relu>(conv1_3);
+ auto conv1_4 = ngraph::builder::makeConvolution(relu1_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu1_4 = std::make_shared<ngraph::opset1::Relu>(conv1_4);
+
+ auto conv2_0 = ngraph::builder::makeConvolution(split->output(1), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu2_0 = std::make_shared<ngraph::opset1::Relu>(conv2_0);
+ auto conv2_1 = ngraph::builder::makeConvolution(relu2_0, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu2_1 = std::make_shared<ngraph::opset1::Relu>(conv2_1);
+ auto conv2_2 = ngraph::builder::makeConvolution(relu2_1, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu2_2 = std::make_shared<ngraph::opset1::Relu>(conv2_2);
+ auto conv2_3 = ngraph::builder::makeConvolution(relu2_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu2_3 = std::make_shared<ngraph::opset1::Relu>(conv2_3);
+ auto conv2_4 = ngraph::builder::makeConvolution(relu2_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto relu2_4 = std::make_shared<ngraph::opset1::Relu>(conv2_4);
+
+ auto concat = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{relu1_4->output(0), relu2_4->output(0)}, 1);
+ ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(concat)};
+ std::shared_ptr<ngraph::Function> fnPtr = std::make_shared<ngraph::Function>(results, params);
+ return fnPtr;
+}
+
+static std::shared_ptr<ngraph::Function>
+makeTIwithLSTMcell(InferenceEngine::Precision prc = InferenceEngine::Precision::FP32) {
+ auto ngPRC = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(prc);
+ // That which we iterate over
+ const size_t N = 32; // Batch size
+ const size_t L = 10; // Sequence length
+ const size_t I = 8; // Input size
+ const size_t H = 32; // Hidden size
+ auto SENT = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, L, I});
+
+ auto H_init = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+ auto C_init = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+
+ auto H_t = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+ auto C_t = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+
+ // Body
+ auto X = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, I});
+ std::vector<uint64_t> dataW(4 * H * I, 0);
+ auto W_body = std::make_shared<ngraph::opset1::Constant>(ngPRC, ngraph::Shape{4 * H, I}, dataW);
+ std::vector<uint64_t> dataR(4 * H * H, 0);
+ auto R_body = std::make_shared<ngraph::opset1::Constant>(ngPRC, ngraph::Shape{4 * H, H}, dataR);
+ std::vector<uint64_t> inShape = {N, H};
+ auto constantH = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{2}, inShape);
+ inShape = {N, I};
+ auto constantX = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{2}, inShape);
+ auto LSTM_cell =
+ std::make_shared<ngraph::opset1::LSTMCell>(std::make_shared<ngraph::opset1::Reshape>(X, constantX, false),
+ std::make_shared<ngraph::opset1::Reshape>(H_t, constantH, false),
+ std::make_shared<ngraph::opset1::Reshape>(C_t, constantH, false),
+ W_body,
+ R_body,
+ H);
+ inShape = {N, 1, H};
+ auto constantHo = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{3}, inShape);
+ auto H_o = std::make_shared<ngraph::opset1::Reshape>(LSTM_cell->output(0), constantHo, false);
+ auto C_o = std::make_shared<ngraph::opset1::Reshape>(LSTM_cell->output(1), constantHo, false);
+ auto body = std::make_shared<ngraph::op::TensorIterator::BodyLambda>(
+ ngraph::OutputVector{H_o, C_o}, ngraph::ParameterVector{X, H_t, C_t});
+
+ auto tensor_iterator = std::make_shared<ngraph::op::TensorIterator>();
+ tensor_iterator->set_body(body);
+ // start=0, stride=1, part_size=1, end=39, axis=1
+ tensor_iterator->set_sliced_input(X, SENT, 0, 1, 1, -1, 1);
+ // H_t is Hinit on the first iteration, Ho after that
+ tensor_iterator->set_merged_input(H_t, H_init, H_o);
+ tensor_iterator->set_merged_input(C_t, C_init, C_o);
+
+ // Output 0 is last Ho, result 0 of body
+ auto out0 = tensor_iterator->get_iter_value(H_o, -1);
+ // Output 1 is last Co, result 1 of body
+ auto out1 = tensor_iterator->get_iter_value(C_o, -1);
+
+ auto results = ngraph::ResultVector{std::make_shared<ngraph::opset1::Result>(out0),
+ std::make_shared<ngraph::opset1::Result>(out1)};
+ auto fn_ptr = std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{SENT, H_init, C_init});
+ return fn_ptr;
+}
+
+static std::shared_ptr<ngraph::Function> makeSingleConv(std::vector<size_t> inputShape = {1, 3, 24, 24},
+ InferenceEngine::Precision prc = InferenceEngine::Precision::FP32) {
+ ngraph::element::Type type = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(prc);
+ auto param0 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+ auto conv1 = ngraph::builder::makeConvolution(param0, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto result = std::make_shared<ngraph::opset1::Result>(conv1);
+ auto fn_ptr = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{param0});
+ return
+ fn_ptr;
+}
+
+static std::shared_ptr<ngraph::Function> makeMultiSingleConv(std::vector<size_t> inputShape = {1, 3, 24, 24}) {
+ ngraph::element::Type type = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(InferenceEngine::Precision::FP32);
+ auto param0 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+ auto conv1 = ngraph::builder::makeConvolution(param0, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv2 = ngraph::builder::makeConvolution(conv1, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv3 = ngraph::builder::makeConvolution(conv2, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv4 = ngraph::builder::makeConvolution(conv3, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv5 = ngraph::builder::makeConvolution(conv4, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv6 = ngraph::builder::makeConvolution(conv5, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv7 = ngraph::builder::makeConvolution(conv6, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv8 = ngraph::builder::makeConvolution(conv7, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv9 = ngraph::builder::makeConvolution(conv8, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto conv10 = ngraph::builder::makeConvolution(conv9, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+ ngraph::op::PadType::EXPLICIT, 5);
+ auto result = std::make_shared<ngraph::opset1::Result>(conv10);
+ auto fn_ptr = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{param0});
+ return
+ fn_ptr;
+}
+
+static std::shared_ptr<ngraph::Function> make2InputSubtract(std::vector<size_t> inputShape = {1, 3, 24, 24},
+ InferenceEngine::Precision prc = InferenceEngine::Precision::FP32) {
+ ngraph::element::Type type = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(prc);
+ auto param0 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+ auto param1 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+ auto subtract = std::make_shared<ngraph::opset1::Subtract>(param0, param1);
+ auto result = std::make_shared<ngraph::opset1::Result>(subtract);
+ return std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{param0, param1});
+}
+} // namespace subgraph
+} // namespace builder
+} // namespace ngraph
\ No newline at end of file
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporationconvert2OutputVector
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <memory>
+#include <ngraph/runtime/interpreter/int_backend_visibility.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/runtime/backend_manager.hpp>
-#include <ngraph/component_manager.hpp>
#include <ngraph/runtime/backend.hpp>
#include <ngraph/runtime/tensor.hpp>
+extern "C" INTERPRETER_BACKEND_API void ngraph_register_interpreter_backend();
+
namespace ngraph {
namespace helpers {
ngraph::runtime::Backend::set_backend_shared_library_search_directory("");
ngraph_register_interpreter_backend();
+
auto backend = ngraph::runtime::Backend::create("INTERPRETER");
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> inTensors;
}
return outData;
}
+
+std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function>& function, const std::vector<std::vector<std::uint8_t>>& inputs);
+
} // namespace helpers
} // namespace ngraph
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
+
#include <vector>
#include <memory>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
+//
#include <vector>
#include <memory>
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ngraph_functions/builders.hpp"
+
+namespace ngraph {
+namespace builder {
+std::shared_ptr<ngraph::Node> makeStridedSlice(const ngraph::Output<Node> &in,
+ const std::vector<int64_t> &begin,
+ const std::vector<int64_t> &end,
+ const std::vector<int64_t> &stride,
+ const element::Type &type,
+ const std::vector<int64_t> &begin_mask,
+ const std::vector<int64_t> &end_mask,
+ const std::vector<int64_t> &new_axis_mask,
+ const std::vector<int64_t> &shrink_mask,
+ const std::vector<int64_t> &ellipsis_mask) {
+ ngraph::Shape constShape = {in.get_shape().size()};
+ auto beginNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, constShape, begin.data());
+ auto endNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, constShape, end.data());
+ auto strideNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, constShape, stride.data());
+ auto ssNode = std::make_shared<ngraph::opset2::StridedSlice>(in, beginNode, endNode, strideNode, begin_mask, end_mask);
+ return ssNode;
+}
+
+} // namespace builder
+} // namespace ngraph
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporationconvert2OutputVector
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/opsets/opset1.hpp>
+#include <ngraph_functions/utils/ngraph_helpers.hpp>
+
namespace ngraph {
namespace helpers {
return outs;
}
-template<class opType>
-ngraph::NodeVector castOps2Nodes(const std::vector<std::shared_ptr<opType>> &ops) {
- ngraph::NodeVector nodes;
- for (const auto &op : ops) {
- nodes.push_back(std::dynamic_pointer_cast<ngraph::Node>(op));
+std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function>& function, const std::vector<std::vector<std::uint8_t>>& inputs) {
+ ngraph::runtime::Backend::set_backend_shared_library_search_directory("");
+ ngraph_register_interpreter_backend();
+ auto backend = ngraph::runtime::Backend::create("INTERPRETER");
+
+ const auto& parameters = function->get_parameters();
+ const auto& parametersNumber = parameters.size();
+ const auto& inputsNumber = inputs.size();
+ NGRAPH_CHECK(parametersNumber == inputsNumber,
+ "Got function (", function->get_friendly_name(), ") with ", parametersNumber, " parameters, but ", inputsNumber, " input blobs");
+
+ auto inputTensors = std::vector<std::shared_ptr<runtime::Tensor>>{};
+ for (const auto& parameter : parameters) {
+ const auto& parameterIndex = function->get_parameter_index(parameter);
+ const auto& parameterShape = parameter->get_shape();
+ const auto& parameterType = parameter->get_element_type();
+ const auto& parameterSize = ngraph::shape_size(parameterShape) * parameterType.size();
+
+ const auto& input = inputs[parameterIndex];
+ const auto& inputSize = input.size();
+ NGRAPH_CHECK(parameterSize == inputSize,
+ "Got parameter (", parameter->get_friendly_name(), ") of size ", parameterSize, " bytes, but corresponding input with index ", parameterIndex,
+ " has ", inputSize, " bytes");
+
+ auto tensor = backend->create_tensor(parameterType, parameterShape);
+ tensor->write(input.data(), parameterSize);
+ inputTensors.push_back(tensor);
}
- return nodes;
+
+ auto outputTensors = std::vector<std::shared_ptr<runtime::Tensor>>{};
+ const auto& results = function->get_results();
+ std::transform(results.cbegin(), results.cend(), std::back_inserter(outputTensors), [&backend](const std::shared_ptr<op::Result>& result) {
+ return backend->create_tensor(result->get_element_type(), result->get_shape()); });
+
+ auto handle = backend->compile(function);
+ handle->call_with_validate(outputTensors, inputTensors);
+ auto outputs = std::vector<std::vector<std::uint8_t>>(results.size());
+ for (const auto& result : results) {
+ const auto& resultIndex = function->get_result_index(result);
+ auto& output = outputs[resultIndex];
+ output.resize(ngraph::shape_size(result->get_shape()) * result->get_element_type().size());
+ outputTensors[resultIndex]->read(output.data(), output.size());
+ }
+
+ return outputs;
}
} // namespace helpers
-# Copyright (C) 2019 Intel Corporation
+# Copyright (C) 2018-2020 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-
#define INTEL_GNA_DLLEXPORT 1
#if GNA_LIB_VER == 1
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gna/gna_config.hpp>
+#include "gna_plugin_config.hpp"
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <map>
+
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+
+const std::map<std::string, std::string> supportedConfigKeysWithDefaults = {
+ {GNA_CONFIG_KEY(SCALE_FACTOR), "1.000000"},
+ {GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0"), "1.000000"},
+ {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), ""},
+ {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), ""},
+ {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT},
+ {GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(YES)},
+ {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
+ {GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name()},
+ {GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(NO)},
+ {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)},
+ {GNA_CONFIG_KEY(LIB_N_THREADS), "1"},
+ {CONFIG_KEY(SINGLE_THREAD), CONFIG_VALUE(YES)}
+};
+
+class GNAPluginConfigTest : public ::testing::Test {
+protected:
+ Config config;
+ void SetAndCompare(const std::string& key, const std::string& val) {
+ config.UpdateFromMap({{key, val}});
+ EXPECT_EQ(config.GetParameter(key), val);
+ }
+ void ExpectThrow(const std::string& key, const std::string& val) {
+ EXPECT_THROW(config.UpdateFromMap({{key, val}}),
+ details::InferenceEngineException);
+ }
+ void SetAndCheckFlag(const std::string& key, bool& val, bool reverse = false) {
+ const bool yes = reverse ? false : true;
+ const bool no = !yes;
+ SetAndCompare(key, CONFIG_VALUE(YES));
+ EXPECT_EQ(val, yes);
+ SetAndCompare(key, CONFIG_VALUE(NO));
+ EXPECT_EQ(val, no);
+ SetAndCompare(key, CONFIG_VALUE(YES));
+ EXPECT_EQ(val, yes);
+ ExpectThrow(key, "abc");
+ ExpectThrow(key, "");
+ }
+};
+
+TEST_F(GNAPluginConfigTest, GnaConfigDefaultConfigIsExpected) {
+ ASSERT_EQ(config.key_config_map, supportedConfigKeysWithDefaults);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigScaleFactorTest) {
+ config.UpdateFromMap({{GNA_CONFIG_KEY(SCALE_FACTOR), std::string("34")}});
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR)), std::string("34.000000"));
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0")), std::string("34.000000"));
+ EXPECT_EQ(config.inputScaleFactors.size(), 1);
+ EXPECT_FLOAT_EQ(config.inputScaleFactors[0], 34.0);
+
+ config.UpdateFromMap({{GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_3"), std::string("15.2")}});
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR)), std::string("34.000000"));
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0")), std::string("34.000000"));
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_1")), std::string("1.000000"));
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_2")), std::string("1.000000"));
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_3")), std::string("15.200000"));
+ EXPECT_EQ(config.inputScaleFactors.size(), 4);
+ EXPECT_FLOAT_EQ(config.inputScaleFactors[0], 34.0);
+ EXPECT_FLOAT_EQ(config.inputScaleFactors[1], 1.0);
+ EXPECT_FLOAT_EQ(config.inputScaleFactors[2], 1.0);
+ EXPECT_FLOAT_EQ(config.inputScaleFactors[3], 15.2);
+
+ config.UpdateFromMap({{GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_9"), std::string("8.43")}});
+ EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_9")), std::string("8.430000"));
+ EXPECT_EQ(config.inputScaleFactors.size(), 10);
+ EXPECT_FLOAT_EQ(config.inputScaleFactors[9], 8.43);
+
+ ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_10"), std::string("8.43"));
+ ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("&1"), std::string("8.43"));
+ ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_"), std::string("8.43"));
+ ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("abs"), std::string("8.43"));
+ ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR), std::string("abc"));
+ ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR), std::string("0"));
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigFirmwareModelImageTest) {
+ SetAndCompare(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), "abc");
+ EXPECT_EQ(config.dumpXNNPath, "abc");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigFirmwareModelImageGeneratorTest) {
+ SetAndCompare(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), "def");
+ EXPECT_EQ(config.dumpXNNGeneration, "def");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigDeviceModeTest) {
+ SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_HW);
+#if GNA_LIB_VER == 1
+ EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_HARDWARE));
+#else
+ EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeHardware);
+ EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersionSoftwareEmulation);
+#endif
+ SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW);
+#if GNA_LIB_VER == 1
+ EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_SOFTWARE));
+#else
+ EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeSoftware);
+ EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersionSoftwareEmulation);
+#endif
+ SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT);
+#if GNA_LIB_VER == 1
+ EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE));
+#else
+ EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeSoftware);
+ EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersion1_0);
+#endif
+ SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_AUTO);
+#if GNA_LIB_VER == 1
+ EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_AUTO));
+#else
+ EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeAuto);
+ EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersionSoftwareEmulation);
+#endif
+ ExpectThrow(GNA_CONFIG_KEY(DEVICE_MODE), "");
+ ExpectThrow(GNA_CONFIG_KEY(DEVICE_MODE), "abc");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigCompactMode) {
+ SetAndCheckFlag(GNA_CONFIG_KEY(COMPACT_MODE),
+ config.gnaFlags.compact_mode);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigExclusiveAsyncRequestTest) {
+ SetAndCheckFlag(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
+ config.gnaFlags.exclusive_async_requests);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigPrecisionTest) {
+ SetAndCompare(GNA_CONFIG_KEY(PRECISION), Precision(Precision::I8).name());
+ EXPECT_EQ(config.gnaPrecision, Precision::I8);
+ SetAndCompare(GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name());
+ EXPECT_EQ(config.gnaPrecision, Precision::I16);
+ ExpectThrow(GNA_CONFIG_KEY(PRECISION), Precision(Precision::FP32).name());
+ ExpectThrow(GNA_CONFIG_KEY(PRECISION), "");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigPwlUniformDesignTest) {
+ SetAndCheckFlag(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN),
+ config.gnaFlags.uniformPwlDesign);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigPerfCountTest) {
+ SetAndCheckFlag(CONFIG_KEY(PERF_COUNT),
+ config.gnaFlags.performance_counting);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigLibNThreadsTest) {
+ SetAndCompare(GNA_CONFIG_KEY(LIB_N_THREADS), "2");
+ EXPECT_EQ(config.gnaFlags.gna_lib_async_threads_num, 2);
+ SetAndCompare(GNA_CONFIG_KEY(LIB_N_THREADS), "25");
+ EXPECT_EQ(config.gnaFlags.gna_lib_async_threads_num, 25);
+ ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "");
+ ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "0");
+ ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "128");
+ ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "abc");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigSingleThreadTest) {
+ SetAndCheckFlag(CONFIG_KEY(SINGLE_THREAD),
+ config.gnaFlags.gna_openmp_multithreading,
+ true);
+}
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
# SPDX-License-Identifier: Apache-2.0
#
-set(TARGET_NAME helpers)
+set(TARGET_NAME ieTestHelpers)
file(GLOB HELPERS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
## Enable Models multiple search pathes
message("configuring file: ${CMAKE_CURRENT_BINARY_DIR}/test_model_repo.h")
-configure_file(test_model_repo.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/test_model_repo.hpp @ONLY)
function(add_helpers target_name)
add_library(${target_name} STATIC ${HELPERS_SRC})
target_include_directories(${target_name} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}"
"${IE_MAIN_SOURCE_DIR}/src/inference_engine"
+ $<TARGET_PROPERTY:inference_engine_ir_readers,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:pugixml,INTERFACE_INCLUDE_DIRECTORIES>
"${IE_MAIN_SOURCE_DIR}/src/vpu/"
target_include_directories(${target_name} PUBLIC
"${IE_MAIN_SOURCE_DIR}/samples/common/os/windows")
- target_compile_definitions(${target_name} PUBLIC ${ARGV}
- MODELS_PATH=\"${MODELS_PATH}\" DATA_PATH=\"${VALIDATION_SET}\")
-
set_property(TARGET ${target_name} PROPERTY COMPILE_PDB_NAME ${target_name})
# add_cpplint_target(${target_name}_cpplint FOR_TARGETS ${target_name})
fp16_ptr = blob->buffer().as<ie_fp16*>();
} else if (precision == Precision::FP32) {
fp32_ptr = blob->buffer().as<float*>();
+ } else if (precision == Precision::I32) {
+ i32_ptr = blob->buffer().as<int32_t*>();
} else {
THROW_IE_EXCEPTION << "Unsupported precision for compare: " << precision;
}
}
float BufferWrapper::operator[](size_t index) {
- if (precision == Precision::FP16) return PrecisionUtils::f16tof32(fp16_ptr[index]);
+ if (precision == Precision::FP16) {
+ return PrecisionUtils::f16tof32(fp16_ptr[index]);
+ } else if (precision == Precision::I32) {
+ return i32_ptr[index];
+ }
return fp32_ptr[index];
}
void BufferWrapper::insert(size_t index, float value) {
if (precision == Precision::FP16) {
fp16_ptr[index] = PrecisionUtils::f32tof16(value);
- } else {
+ } else if (precision == Precision::I32) {
+ i32_ptr[index] = value;
+ }
+ else {
fp32_ptr[index] = value;
}
}
InferenceEngine::Precision precision;
InferenceEngine::ie_fp16 *fp16_ptr;
float *fp32_ptr;
+ int32_t *i32_ptr;
public:
explicit BufferWrapper(const InferenceEngine::Blob::Ptr &blob);
}
#endif
-const char* getModelPathNonFatal() noexcept {
-#ifdef MODELS_PATH
- const char* models_path = std::getenv("MODELS_PATH");
-
- if (models_path == nullptr && MODELS_PATH == nullptr) {
- return nullptr;
- }
-
- if (models_path == nullptr) {
- return MODELS_PATH;
- }
-
- return models_path;
-#else
- return nullptr;
-#endif
-}
-
-
static std::string get_models_path() {
- const char* models_path = getModelPathNonFatal();
+ const char* models_path = TestDataHelpers::getModelPathNonFatal();
if (nullptr == models_path) {
::testing::AssertionFailure() << "MODELS_PATH not defined";
}
ModelsPath::operator std::string() const {
-
std::vector<std::string> absModelsPath;
for (auto & path : getModelsDirs()) {
- const auto absPath = get_models_path() + kPathSeparator + "src" + kPathSeparator + path + _rel_path.str();
+ std::string b = get_models_path();
+ const auto absPath = get_models_path() + kPathSeparator + path + _rel_path.str();
absModelsPath.push_back(absPath);
if (exist(absPath)) {
return absPath;
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <string>
+
+std::string get_model_repo();
+
+namespace TestDataHelpers {
+
+const char *getModelPathNonFatal() noexcept;
+
+std::string get_data_path();
+
+inline const char *getModelPathNonFatalDefault() noexcept {
+#ifdef MODELS_PATH
+ const char *models_path = std::getenv("MODELS_PATH");
+
+ if (models_path == nullptr && MODELS_PATH == nullptr) {
+ return nullptr;
+ }
+
+ if (models_path == nullptr) {
+ return MODELS_PATH;
+ }
+
+ return models_path;
+#else
+ return nullptr;
+#endif
+};
+
+inline std::string get_data_path_default() {
+#ifdef DATA_PATH
+ const char *data_path = std::getenv("DATA_PATH");
+
+ if (data_path == NULL) {
+ if (DATA_PATH != NULL) {
+ data_path = DATA_PATH;
+ } else {
+ return nullptr;
+ }
+ }
+ return std::string(data_path);
+#else
+ return nullptr;
+#endif
+}
+} // namespace TestDataHelpers
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-#include <string>
-std::string get_model_repo() {
- return "@MODELS_LST@";
-}
\ No newline at end of file
std::make_shared<LayerTestCreator<InferenceEngine::ReduceLayer>>("ReduceSumSquare"),
std::make_shared<LayerTestCreator<InferenceEngine::TopKLayer>>("TopK"),
std::make_shared<LayerTestCreator<InferenceEngine::NonMaxSuppressionLayer>>("NonMaxSuppression"),
- std::make_shared<LayerTestCreator<InferenceEngine::ScatterLayer>>("ScatterUpdate")
+ std::make_shared<LayerTestCreator<InferenceEngine::ScatterUpdateLayer>>("ScatterUpdate")
};
return creators;
}
#include <ie_input_info.hpp>
#include <ie_icnn_network.hpp>
+#include "test_model_repo.hpp"
#include "test_model_path.hpp"
#include <tests_file_utils.hpp>
#include <chrono>
class TestsCommon : public ::testing::Test {
public:
IE_SUPPRESS_DEPRECATED_START
- static InferenceEngine::CNNLayer::Ptr createLayer(const std::string& type);
+
+ static InferenceEngine::CNNLayer::Ptr createLayer(const std::string &type);
+
IE_SUPPRESS_DEPRECATED_END
protected:
void SetUp() override;
+
void TearDown() override;
public:
return make_plugin_name("mock_engine");
}
- static std::string get_data_path(){
- const char* data_path = std::getenv("DATA_PATH");
-
- if (data_path == NULL){
- if(DATA_PATH != NULL){
- data_path = DATA_PATH;
- } else{
- ::testing::AssertionFailure()<<"DATA_PATH not defined";
- }
- }
- return std::string(data_path);
- }
-
static std::string make_so_name(const std::string & input) {
return CommonTestUtils::pre + input + IE_BUILD_POSTFIX + CommonTestUtils::ext;
}
target_link_libraries(${TARGET_NAME} PRIVATE
# static libraries
+ inference_engine_s # need to have this explicitly for USE_STATIC_IE
unitTestUtils
- helpers_s
+ ieTestHelpers_s
${GNA_TEST_ENGINE}
# dynamic libraries
inference_engine_lp_transformations
+ inference_engine_ir_readers
inference_engine_transformations
- ${CMAKE_DL_LIBS}
- )
+ ${CMAKE_DL_LIBS})
if(TARGET libGNAStubs)
target_link_libraries(${TARGET_NAME} PRIVATE libGNAStubs)
target_link_libraries(${TARGET_NAME} PRIVATE mkldnn)
endif ()
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fuse-ld=gold")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
+endif()
+
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME})
.node("net")
.attr("name", "AlexNet").attr("version", 2);
- ASSERT_THROW(parse(content), InferenceEngine::details::InferenceEngineException);
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ ASSERT_THROW(parse(content), std::exception);
}
TEST_F (V2FormatParserTest, canParseDims) {
case GnaPluginTestEnvironment::matchAffineWeights:
HasWeightsEq(combined, _env.transposedData);
break;
+ case GnaPluginTestEnvironment::matchAffineWeightsSize:
+ HasWeightsSizeEq(combined, _env.matched_weight_size);
+ break;
case GnaPluginTestEnvironment::saveAffineWeights:
SaveWeights(combined, _env.transposedData, _env.transposedArgsForSaving);
break;
fillOutputValues,
matchAffineWeightsTranspose,
matchAffineWeights,
- saveAffineWeights
+ matchAffineWeightsSize,
+ saveAffineWeights,
};
enum {
kUnset = -1,
std::pair<int, int> transposedArgsForSaving;
std::vector<uint16_t>* transposedData;
std::vector<DnnActivationType> pwlsToMatchWith;
+ size_t matched_weight_size = 0;
+ size_t nCopyLayersToMatch = -1;
};
class GNATestBase {
_env.config[keyName] = ss.str();
return *dynamic_cast<T*>(this);
}
+ T & onCPU() {
+ _env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = GNA_CONFIG_VALUE(SW_FP32);
+ return *dynamic_cast<T*>(this);
+ }
+ T & withPolicy(GNAPluginNS::Policy::ConcatAlignment concatAlignmentPolicy) {
+ _env.policy.ConcatAlignmentPolicy = concatAlignmentPolicy;
+ return *dynamic_cast<T*>(this);
+ }
T & withGNADeviceMode(std::string value) {
_env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = value;
return *dynamic_cast<T*>(this);
return *this;
}
- GNAPropagateMatcher & And() {
- return *this;
- }
-
GNAPropagateMatcher & that() {
return *this;
}
return *this;
}
-
GNAPropagateMatcher & once() {
return times(1);
}
return *this;
}
-
GNAPropagateMatcher & affine_weights_transpozed(std::pair<int, int> &&transpozedArgs) {
getMatcher().type = GnaPluginTestEnvironment::saveAffineWeights;
_env.transposedArgsForSaving = std::move(transpozedArgs);
return *this;
}
- GNAPropagateMatcher & onCPU() {
- _env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = GNA_CONFIG_VALUE(SW_FP32);
- return *this;
- }
-
protected:
void match();
intel_nnet_type_t * original_nnet = nullptr;
void match();
};
+/**
+ * @brief weights matcher has specific weights matching methods
+ */
+class GNAWeightsMatcher : public GNAPropagateMatcher {
+ public:
+ using base = GNAPropagateMatcher;
+ using base::base;
+
+ GNAWeightsMatcher & size() {
+ getMatcher().type = GnaPluginTestEnvironment::matchAffineWeightsSize;
+ return *this;
+ }
+ GNAWeightsMatcher & equals_to(size_t weights_size) {
+ if (getMatcher().type == GnaPluginTestEnvironment::matchAffineWeightsSize) {
+ _env.matched_weight_size = weights_size;
+ }
+ return *this;
+ }
+};
+
/**
_env.model = _model;
return *this;
}
+ GNATest & afterLoadingModel(std::shared_ptr<ngraph::Function> ngraph_model) {
+ _env.ngraph_model = ngraph_model;
+ return *this;
+ }
+
+ GNAWeightsMatcher & affine_weights() {
+ returnedMatchers.push_back(std::make_shared<GNAWeightsMatcher>(_env));
+ _env = GnaPluginTestEnvironment();
+ return dynamic_cast<GNAWeightsMatcher&>(*returnedMatchers.back());
+ }
GNAQueryStateMatcher & queryState() {
returnedMatchers.push_back(std::make_shared<GNAQueryStateMatcher>(_env));
_env = GnaPluginTestEnvironment();
return dynamic_cast<GNAPropagateMatcher&>(*returnedMatchers.back());
}
+
GNATest & importedFrom(std::string fileName) {
_env.importedModelFileName = fileName;
return *this;
}
+
GNATest & onInferModel(std::string _model = "",
std::function<void (InferenceEngine::CNNNetwork &)> _cb = [](InferenceEngine::CNNNetwork & net){}) {
_env.model = _model;
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+#include <single_layer_common.hpp>
+#include <ngraph/op/parameter.hpp>
+#include <ngraph/ops.hpp>
+#include <ie_precision.hpp>
+#include "../gna_matcher.hpp"
+
+using GNAAlignFilterTestParams = std::tuple<InferenceEngine::Precision, GNAPluginNS::Policy::ConcatAlignment, std::size_t, std::size_t>;
+using namespace GNAPluginNS;
+
+class GNAAlignFilterTest : public GNATest<>,
+ public testing::WithParamInterface<GNAAlignFilterTestParams> {
+ public:
+
+ static std::string getTestName(const testing::TestParamInfo<GNAAlignFilterTestParams>& params) {
+ std::string test_name;
+ if (std::get<1>(params.param) == GNAPluginNS::Policy::ConcatAlignment::FAST) {
+ test_name += "fast_";
+ }
+ test_name += "concat_of(" + std::to_string(std::get<2>(params.param));
+ test_name += "_" + std::to_string(std::get<3>(params.param));
+ test_name += ")_on_";
+ test_name += std::get<0>(params.param).name();
+ return test_name;
+ }
+
+ protected:
+
+ InferenceEngine::Precision precision = InferenceEngine::Precision::FP32;
+ std::size_t concat_inputs[2];
+ GNAPluginNS::Policy::ConcatAlignment alignmentPolicy;
+
+ void SetUp() override {
+ std::tie(precision, alignmentPolicy, concat_inputs[0], concat_inputs[1]) = GetParam();
+ }
+
+ std::shared_ptr<ngraph::Function> getNgraphModel() {
+ auto input0 = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{1, concat_inputs[0]});
+ auto input1 = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{1, concat_inputs[1]});
+
+ auto relu0 = std::make_shared<ngraph::op::v0::Relu>(input0);
+ auto relu1 = std::make_shared<ngraph::op::v0::Relu>(input1);
+
+ auto concat = std::make_shared<ngraph::op::Concat>(ngraph::NodeVector{relu0, relu1}, 1);
+
+ auto relu3 = std::make_shared<ngraph::op::v0::Relu>(concat);
+
+ auto function = std::make_shared<ngraph::Function>(ngraph::NodeVector{relu3}, ngraph::ParameterVector{input0, input1});
+ return function;
+ }
+};
+
+TEST_P(GNAAlignFilterTest, concatWith_2_Inputs_Small_mem_footprint) {
+
+ auto ngraf = getNgraphModel();
+ if (precision == InferenceEngine::Precision::FP32) {
+ GTEST_SKIP() << "FP32 case - won't produce gna primitives";
+ }
+
+ // calc expected weight size
+ size_t expected_affine_size = 0;
+ size_t expected_copy_layers = 0;
+
+ auto getFastAffineFilterParams = [](size_t sz) -> std::pair<size_t, size_t> {
+ //align first input by 8
+ auto copy_N = sz > 32 ? 1 : 0; // number of copy layers
+ auto firstFilter_frac = sz % 32;
+ auto firstFilter_N = ALIGN(firstFilter_frac, 8);
+
+ return {copy_N, firstFilter_N * firstFilter_frac};
+ };
+
+ auto getNumCopyElements = [&getFastAffineFilterParams](size_t sz) {
+ return getFastAffineFilterParams(sz).first;
+ };
+ auto getsNumFilterWeights = [&getFastAffineFilterParams](size_t sz) {
+ return getFastAffineFilterParams(sz).second;
+ };
+
+ switch(alignmentPolicy) {
+ case Policy::ConcatAlignment::ENABLED : {
+ //align first input by 8
+ auto firstFilter = ALIGN(concat_inputs[0], 8) * concat_inputs[0];
+ //extra elements carried into the second filter because the first input is aligned to 32
+ auto extraLeftElementsForSecond = concat_inputs[0] + 32 - ALIGN(concat_inputs[0], 32);
+
+ auto secondFilter = ALIGN(concat_inputs[1], 8) * (extraLeftElementsForSecond + concat_inputs[1]);
+
+ expected_affine_size = firstFilter + secondFilter;
+ break;
+ }
+ case Policy::ConcatAlignment::FAST : {
+
+ expected_copy_layers = getNumCopyElements(concat_inputs[0]);
+ expected_affine_size = getsNumFilterWeights(concat_inputs[0]);
+
+ // calculate the size of the second filter
+ auto offset = ALIGN(concat_inputs[0], 32) - 32;
+ auto zerolen = concat_inputs[0] - offset;
+ auto second_output_len = zerolen + concat_inputs[1];
+
+ expected_affine_size += second_output_len * ALIGN(concat_inputs[1], 8);
+ break;
+ }
+
+ default : {
+ FAIL() << "unsupported align policy: " << alignmentPolicy;
+ }
+ }
+
+ assert_that().onInferNgraphModel(ngraf)
+ .inNotCompactMode()
+ .withPolicy(alignmentPolicy)
+ .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_0", 1.0f)
+ .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_1", 1.0f)
+ .withGNAConfig(GNA_CONFIG_KEY(PRECISION), precision.name())
+ .gna()
+ .affine_weights()
+ .size()
+ .equals_to(expected_affine_size)
+ .And()
+ .copy_inserted_into_nnet()
+ .times(expected_copy_layers);
+}
+
+TEST_P(GNAAlignFilterTest, concatWith_2_Inputs_accurate) {
+ auto ngraf = getNgraphModel();
+ if (precision == InferenceEngine::Precision::FP32) {
+ std::vector<std::vector<float>> input_data;
+ float start_value = 1.0;
+
+ for (auto dim : concat_inputs) {
+ if (dim > 0) {
+ input_data.push_back(std::vector<float>(dim));
+
+ std::iota(input_data.back().begin(), input_data.back().end(), start_value);
+ start_value += dim;
+ }
+ }
+
+ std::vector<float> expected_result(static_cast<size_t>(start_value - 1));
+ start_value = 1.0;
+ std::iota(expected_result.begin(), expected_result.end(), start_value);
+ assert_that().onInferNgraphModel(ngraf)
+ .inNotCompactMode()
+ .gna()
+ .propagate_forward()
+ .onCPU()
+ .withPolicy(alignmentPolicy)
+ .called_with()
+ .input(ngraf->get_parameters().at(0)->get_name(), input_data[0])
+ .input(ngraf->get_parameters().at(1)->get_name(), input_data[1])
+ .equals_to(expected_result);
+ } else {
+ assert_that().onInferNgraphModel(ngraf)
+ .inNotCompactMode()
+ .gna()
+ .withPolicy(alignmentPolicy)
+ .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_0", 1.0f)
+ .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_1", 1.0f)
+ .withGNAConfig(GNA_CONFIG_KEY(PRECISION), "I16")
+ .propagate_forward()
+ .called();
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ GNALayerTests,
+ GNAAlignFilterTest,
+ testing::Combine(
+ testing::Values(InferenceEngine::Precision::FP32, InferenceEngine::Precision::I16),
+ //fast or not fast alignment policy
+ testing::Values(GNAPluginNS::Policy::ConcatAlignment::FAST, GNAPluginNS::Policy::ConcatAlignment::ENABLED),
+ // Size of first Split layer output
+ testing::Values(31, 49),
+ // Size of second Split layer output
+ testing::Values(31, 73)),
+ GNAAlignFilterTest::getTestName);
class CopyLayerMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
bool matchInserted;
const int matchQuantity;
+ mutable int actualNumberOfCopyLayers;
public:
CopyLayerMatcher(bool matchInserted, int matchQuantity) : matchInserted(matchInserted), matchQuantity(matchQuantity) {}
bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
if (foo == nullptr)
return false;
+ actualNumberOfCopyLayers = 0;
+
for(int i = 0; i < foo->nLayers; i++) {
if (foo->pLayers[i].nLayerKind != INTEL_COPY) continue;
- return matchInserted;
+
+ if (!matchInserted) {
+ return false;
+ }
+ actualNumberOfCopyLayers ++;
+ }
+ if (matchQuantity == -1) {
+ if (actualNumberOfCopyLayers > 0) {
+ return true;
+ }
+ return false;
+ }
+ if (actualNumberOfCopyLayers != matchQuantity) {
+ return false;
}
- return !matchInserted;
+ return true;
};
void DescribeTo(::std::ostream *os) const override {
- *os << "should "<< (matchInserted ? "" : "not ") << "have Copy primitive as part of nnet structure";
+ *os << "should "<< (matchInserted ? "" : "not ") << "have " << (matchInserted ? std::to_string(matchQuantity) : "" )
+ << " Copy primitives as part of nnet structure" << (matchInserted ? std::string(" but was only: ") + std::to_string(actualNumberOfCopyLayers) + " copy layers" : "" );
}
};
auto affine = (intel_affine_func_t*)foo->pLayers[i].pLayerStruct;
auto affineWeightsSize = foo->pLayers[i].nOutputRows *
- foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows;
+ (foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows);
if (affineWeightsSize != std::get<0>(transpozedData)->size()) {
error << "gna-xnn layer(" << i << ") weights size mismatch: expected "
}
};
+class WeightsSizeMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+ enum HowMatch{
+ eNone,
+ eEqAffine,
+ } eMatchKind;
+
+ mutable std::stringstream error;
+ mutable int actual;
+ size_t expected_weights_size;
+ public:
+ explicit WeightsSizeMatcher(const size_t data_len) :
+ eMatchKind(eEqAffine),
+ expected_weights_size(data_len){
+ }
+ bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+ if (foo == nullptr)
+ return false;
+
+ size_t sizeTotal = 0;
+ std::stringstream ss;
+ for(int i = 0; i < foo->nLayers; i++) {
+ if (foo->pLayers[i].nLayerKind != INTEL_AFFINE && eMatchKind == eEqAffine) continue;
+
+ auto affineWeightsSize = foo->pLayers[i].nOutputRows *
+ (foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows);
+
+ sizeTotal += affineWeightsSize;
+ ss << "[" << i << "]: " << affineWeightsSize << ", ";
+
+ }
+
+ if (eMatchKind == eEqAffine && sizeTotal != expected_weights_size) {
+ error << "gna-affine layers " << ss.str() << " have diff total weights size : " << sizeTotal
+ << ", while expected to have: " << expected_weights_size << "\n";
+ return false;
+ }
+ return true;
+ };
+ void DescribeTo(::std::ostream *os) const override {
+ *os << error.str() << std::endl;
+ }
+};
+
class WeightsSaver: public ::testing::MatcherInterface<const intel_nnet_type_t*> {
mutable TranspozeIterator iterator;
components->add(new WeightsSaver(make_tuple(data, dims.first, dims.second)));
}
+void HasWeightsSizeEq(std::unique_ptr<NNetComponentMatcher>& components, size_t weights_size) {
+ components->add(new WeightsSizeMatcher(weights_size));
+}
+
std::string deser_header, ref_header = "U8 4D shape: 2 3 4 5 (120)";
std::getline(buff, deser_header);
+ deser_header = deser_header.substr(0, ref_header.length());
ASSERT_EQ(deser_header, ref_header);
auto num_line = std::count(std::istreambuf_iterator<char>(buff),
std::string deser_header, ref_header = "FP32 2D shape: 2 3 (6)";
std::getline(buff, deser_header);
+ deser_header = deser_header.substr(0, ref_header.length());
ASSERT_EQ(deser_header, ref_header);
auto num_line = std::count(std::istreambuf_iterator<char>(buff),
// SPDX-License-Identifier: Apache-2.0
//
+
#include "common_test_utils/data_utils.hpp"
#include "mkldnn_graph.h"
#include "test_graph.hpp"
}
};
-TEST_P(MKLDNNCPUExtScatterTFTests, TestsScatter) {}
+// Disabled these tests as they need to adjust with new specs:
+// - new Scatter Update layer: like TF scatter_update
+// - new Scatter Elements Update: like ONNX Scatter Elements
+// See merge requests:
+// DLDT #6005: Specification for the ScatterElementsUpdate layer
+// DLDT #6091: Specification for ScatterUpdate operation
+TEST_P(MKLDNNCPUExtScatterTFTests, DISABLED_TestsScatter) {}
INSTANTIATE_TEST_CASE_P(
TestsScatter, MKLDNNCPUExtScatterTFTests,
pipeline.run(model);
- ASSERT_EQ(data1->location(), DataLocation::CMX);
+ ASSERT_EQ(data1->dataLocation().location, Location::CMX);
ASSERT_EQ(data1->numConsumers(), 1);
auto data1Consumer = data1->singleConsumer();
auto data1ConsumerOutput = data1Consumer->output(0);
ASSERT_EQ(data1Consumer->type(), StageType::Copy);
- ASSERT_EQ(data1ConsumerOutput->location(), DataLocation::BSS);
+ ASSERT_EQ(data1ConsumerOutput->dataLocation().location, Location::BSS);
ASSERT_EQ(data1ConsumerOutput->numChildDatas(), 4);
ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data2](const SharedAllocation& e) { return e->child() == data2; }));
ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data3](const SharedAllocation& e) { return e->child() == data3; }));
pipeline.run(model);
auto hw1Output = hw1->output(0);
- ASSERT_EQ(hw1Output->location(), DataLocation::CMX);
+ ASSERT_EQ(hw1Output->dataLocation().location, Location::CMX);
auto copyStage = hw1Output->singleConsumer();
ASSERT_EQ(copyStage->type(), StageType::Copy);
auto copyStageOutput = copyStage->output(0);
- ASSERT_EQ(copyStageOutput->location(), DataLocation::BSS);
+ ASSERT_EQ(copyStageOutput->dataLocation().location, Location::BSS);
ASSERT_EQ(copyStageOutput->numConsumers(), 2);
for (const auto& copyStageOutputConsumer : copyStageOutput->consumers()) {
//
#include "graph_transformer_tests.hpp"
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+
#include <precision_utils.h>
using namespace vpu;
const auto model = CreateModel();
- const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize()));
- const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize()));
+ const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize(), dataDesc1));
+ const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize(), dataDesc2));
const auto concatData = model->addNewData("concat", dataDescConcat);
const auto model = CreateModel();
- const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize()));
- const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize()));
+ const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize(), dataDesc1));
+ const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize(), dataDesc2));
const auto concatData = model->addNewData("concat", dataDescConcat);
// SPDX-License-Identifier: Apache-2.0
//
-#include <initializer_list>
+#include "graph_transformer_tests.hpp"
+
#include <vpu/stages/stub_stage.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
-#include "graph_transformer_tests.hpp"
+#include <initializer_list>
using namespace vpu;
#include <vpu/stages/stub_stage.hpp>
#include "graph_transformer_tests.hpp"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
using namespace vpu;
ASSERT_EQ(2, cnnNet.getBatchSize());
ASSERT_EQ(2, cnnNet.getCNNNetwork()->getBatchSize());
-
- auto cnnNet2 = cnnNet.cloneNGraphImpl();
-
- ASSERT_EQ(2, cnnNet2->getBatchSize());
- ASSERT_EQ(2, cnnNet2->getCNNNetwork()->getBatchSize());
- ASSERT_NE(cnnRefNet, cnnNet2->getCNNNetwork());
}
TEST_F(CNNNGraphImplTests, TestSaveAffinity) {
#include "tests_common.hpp"
#include <convert_function_to_cnn_network.hpp>
+#include <cpp/ie_cnn_network.h>
#include <ngraph/function.hpp>
#include <ngraph/opsets/opset1.hpp>
ngraph::ParameterVector{param1, param2});
}
- InferenceEngine::details::CNNNetworkNGraphImpl nGraphImpl(f);
+ InferenceEngine::CNNNetwork nGraphImpl(f);
try {
auto net = InferenceEngine::details::convertFunctionToICNNNetwork(f, nGraphImpl);
FAIL();
ngraph::ParameterVector{param1, param2});
}
- InferenceEngine::details::CNNNetworkNGraphImpl nGraphImpl(f);
+ InferenceEngine::CNNNetwork nGraphImpl(f);
try {
auto net = InferenceEngine::details::convertFunctionToICNNNetwork(f, nGraphImpl);
} catch (InferenceEngine::details::InferenceEngineException &err) {
FAIL();
}
-}
\ No newline at end of file
+}
)_";
TEST(NetworkSerializerTest, TopoSortResultUnique) {
-
- auto reader = std::shared_ptr<InferenceEngine::ICNNNetReader>(InferenceEngine::CreateCNNNetReader());
+ auto reader = InferenceEngine::CreateCNNNetReaderPtr();
InferenceEngine::ResponseDesc resp;
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include <gtest/gtest.h>
-#include "cpp/ie_cnn_net_reader.h"
-
-using namespace InferenceEngine;
-
-class PointerTests : public ::testing::Test {};
-
-TEST_F(PointerTests, InferenceEnginePtrStoresValues) {
- std::shared_ptr <ICNNNetReader> p(InferenceEngine::CreateCNNNetReader());
- ASSERT_NE(p.get(), nullptr);
-}
string testContent = getNetworkWithConvLayer("Q78", { 1, 1, 3, 227, 227 });
xmlHelper->loadContent(testContent);
- EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ EXPECT_THROW(xmlHelper->parse(), std::exception);
}
//convolution input must be 4D
string testContent = getNetworkWithConvLayer("Q78", { 227, 227 });
xmlHelper->loadContent(testContent);
- EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ EXPECT_THROW(xmlHelper->parse(), std::exception);
}
//pooling input must be 4D
TEST_F(V2TopologyVerificationTests, testCheckPoolingInputDim_Less) {
string testContent = getNetworkWithPoolLayer({ 227, 227 });
xmlHelper->loadContent(testContent);
- EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ EXPECT_THROW(xmlHelper->parse(), std::exception);
}
//pooling input must be 4D
TEST_F(V2TopologyVerificationTests, testCheckPoolingInputDim_More) {
string testContent = getNetworkWithPoolLayer({ 1, 1, 3, 227, 227 });
xmlHelper->loadContent(testContent);
- EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ EXPECT_THROW(xmlHelper->parse(), std::exception);
}
TEST_F(V2TopologyVerificationTests, testLeayerPrecisionIsNotMIXED) {
string testContent = getNetworkWithConvLayer("MIXED");
xmlHelper->loadContent(testContent);
- EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ EXPECT_THROW(xmlHelper->parse(), std::exception);
}
TEST_F(V2TopologyVerificationTests, testMixedPrecisionIfLayerAndNetworkPrecisionsDiffer) {
string testContent = getNetworkWithCropLayer({ data });
xmlHelper->loadContent(testContent);
- ASSERT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+
+ // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+ ASSERT_THROW(xmlHelper->parse(), std::exception);
}
TEST_F(V2TopologyVerificationTests, testNoThrowWithProperCropParameters) {
} else if (this->format == cldnn::format::os_is_yx_osv32_isv32p) {
sizes[0] = align_to(sizes[0], 32);
sizes[1] = align_to(sizes[1], 32);
+ } else if (this->format == cldnn::format::image_2d_rgba) {
+ sizes[1] = 4;
}
size_t total = std::accumulate(
sizes.begin(),
b_fs_yx_32fp, ///< format for data for binary convolutions
winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
nv12, ///< format for media nv12 input
+ image_2d_rgba, ///< format for image2d RGBA, always allocates memory for 4 feature maps (even when only 3 are used)
// Weights formats
oiyx, ///< the most common format for 2D weights
yxio, ///< format used 2D weights
oizyx, ///< the most common format for 3D convolution
os_iyx_osv16, ///< format used only for convolution weights:
+ os_is_yx_osv16_isv16, ///< format used for convolution i8 weights
os_zyxi_osv16, ///< format used for weights for 3D convolution
os_is_yx_isv16_osv16, ///< format used for blocked convolution
os_is_zyx_isv16_osv16, ///< format used for weights for blocked 3D convolution
gs_oiyx_gsv16, ///< format used for weights for 2D convolution
gs_oiyx_gsv32, ///< format used for weights for 2D convolution
g_is_os_zyx_osv16_isv16, ///< format used for grouped weights for blocked 3D deconvolution
+ g_os_is_yx_osv16_isv4,
g_is_os_yx_osv16_isv16,
g_os_is_zyx_isv8_osv16_isv2,
g_os_is_yx_isv8_osv16_isv2,
{ bs_fs_zyx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16 }, {1, 16}}}},
{ bs_fs_yx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
{ nv12, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
+ { image_2d_rgba, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
{ oiyx, { 1, 1, 2, 0, 0, "bfyx", "bfxy", {}}},
{ yxio, { 1, 1, 2, 0, 0, "yxfb", "bfxy?", {}}},
{ os_is_zyx_isv8_osv16_isv2, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{1, 8}, {0, 16}, {1, 2}}}},
{ os_zyxi_osv16, { 1, 1, 3, 0, 0, "bzyxf", "bfxyz", {{0, 16}}}},
{ os_is_yx_isv8_osv16_isv2, { 1, 1, 2, 0, 0, "bfzyx", "bfxyz", {{1, 8}, {0, 16}, {1, 2}}}},
+ { os_is_yx_osv16_isv16, { 1, 1, 2, 0, 0, "bfyx", "bfxy", {{1, 16}, {0, 16}}}},
{ goiyx, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {}}},
{ goizyx, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {}}},
{ g_is_os_yx_osv16_isv16, { 1, 1, 2, 0, 1, "gfbyx", "bfxy????g", {{0, 16}, {1, 16}}}},
{ g_os_is_zyx_isv8_osv16_isv2, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{1, 8}, {0, 16}, {1, 2}}}},
{ g_os_is_yx_isv8_osv16_isv2, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{1, 8}, {0, 16}, {1, 2}}}},
- { g_os_is_zyx_isv16_osv16, { 1, 1, 3, 0, 1, "bfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
+ { g_os_is_zyx_isv16_osv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
+ { g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfxy", "bfxy????g", {{0, 16}, {1, 4}}}},
};
return traits.at(fmt);
}
fmt == image_2d_weights_c1_b_fyx ||
fmt == image_2d_weights_winograd_6x3_s1_fbxyb ||
fmt == image_2d_weights_winograd_6x3_s1_xfbyb ||
- fmt == nv12);
+ fmt == nv12 ||
+ fmt == image_2d_rgba);
}
/// @brief Checks if @p format is of grouped type
static bool is_grouped(type fmt) { return group_num(fmt) != 0; }
/// @brief Is optimization that output contains data from second input ON ?
bool second_input_in_output = false;
+ bool depth_to_space_already_fused = false;
protected:
const primitive_id_arr conv_weights;
return detail::errHandler(CL_INVALID_ARG_VALUE, fname);
}
- static PFN_clGetDeviceIDsFromMediaAdapterINTEL pfn_clGetDeviceIDsFromMediaAdapterINTEL = NULL;
+ PFN_clGetDeviceIDsFromMediaAdapterINTEL pfn_clGetDeviceIDsFromMediaAdapterINTEL = NULL;
if (!pfn_clGetDeviceIDsFromMediaAdapterINTEL) {
pfn_clGetDeviceIDsFromMediaAdapterINTEL =
reinterpret_cast<PFN_clGetDeviceIDsFromMediaAdapterINTEL>
0,
NULL,
&n);
- if (err != CL_SUCCESS) {
+ if (err != CL_SUCCESS && err != CL_DEVICE_NOT_FOUND) {
return detail::errHandler(err, fname);
}
- vector<cl_device_id> ids(n);
- err = pfn_clGetDeviceIDsFromMediaAdapterINTEL(
- object_,
- media_adapter_type,
- media_adapter,
- media_adapter_set,
- n,
- ids.data(),
- NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, fname);
- }
+ if (err != CL_DEVICE_NOT_FOUND)
+ {
+ vector<cl_device_id> ids(n);
+ err = pfn_clGetDeviceIDsFromMediaAdapterINTEL(
+ object_,
+ media_adapter_type,
+ media_adapter,
+ media_adapter_set,
+ n,
+ ids.data(),
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, fname);
+ }
- // Cannot trivially assign because we need to capture intermediates
- // with safe construction
- // We must retain things we obtain from the API to avoid releasing
- // API-owned objects.
- if (devices) {
- devices->resize(ids.size());
-
- // Assign to param, constructing with retain behaviour
- // to correctly capture each underlying CL object
- for (size_type i = 0; i < ids.size(); i++) {
- (*devices)[i] = Device(ids[i], true);
+ // Cannot trivially assign because we need to capture intermediates
+ // with safe construction
+ // We must retain things we obtain from the API to avoid releasing
+ // API-owned objects.
+ if (devices) {
+ devices->resize(ids.size());
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < ids.size(); i++) {
+ (*devices)[i] = Device(ids[i], true);
+ }
}
- }
- // set up acquire/release extensions
- SharedSurfLock::Init(object_);
- ImageVA::Init(object_);
+ // set up acquire/release extensions
+ SharedSurfLock::Init(object_);
+ ImageVA::Init(object_);
#ifdef WIN32
- BufferDX::Init(object_);
+ BufferDX::Init(object_);
#endif
-
+ }
return CL_SUCCESS;
}
};
{ DataLayout::b_fs_yx_32fp, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::bfwzyx, { 0, 1, 2, 3, 4, 5 } },
{ DataLayout::nv12, { 0, 1, -1, -1, 2, 3 } },
+ { DataLayout::image_2d_rgba, { 0, 1, -1, -1, 2, 3 } },
}};
WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
{ WeightsLayout::os_i_osv8__ai8, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_i_osv16__ai8, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_i_osv16, { -1, -1, -1, 0, 1, -1, -1, -1 } },
+ { WeightsLayout::os_is_yx_osv16_isv16, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::i_yxs_os_yxsv2_osv16, { 1, 2, -1, 3, 0, -1, -1, -1 } },
{ WeightsLayout::iy_xs_os_xsv2_osv16__ao32, { 1, 2, -1, 3, 0, -1, -1, -1 } },
{ WeightsLayout::iy_xs_os_xsv2_osv8__ao32, { 1, 2, -1, 3, 0, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_isv8_osv16_isv2, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_zyxi_osv16, { 1, 2, 3, 0, 4, -1, -1, -1 } },
{ WeightsLayout::os_i_yxs_osv4_yxsv4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
+ { WeightsLayout::is_os_yx_osv16_isv16, { 0, 1, -1, 3, 2, -1, -1, -1 } },
{ WeightsLayout::goiyx, { 0, 1, -1, 2, 3, -1, -1, 4 } },
{ WeightsLayout::goizyx, { 0, 1, 2, 3, 4, -1, -1, 5 } },
{ WeightsLayout::g_os_iyx_osv16, { 0, 1, -1, 2, 3, -1, -1, 4 } },
{ WeightsLayout::g_os_is_zyx_isv16_osv16, { 0, 1, 2, 3, 4, -1, -1, 5 } },
{ WeightsLayout::giy_xs_os_xsv2_osv16__ao32, { 1, 2, -1, 3, 0, -1, -1, 4 } },
{ WeightsLayout::giy_xs_os_xsv2_osv8__ao32, { 1, 2, -1, 3, 0, -1, -1, 4 } },
- { WeightsLayout::gs_oi_yxs_gsv4_yxsv4, { 0, 1, -1, 2, 3, -1, -1, 4 } },
{ WeightsLayout::g_os_is_yx_isv16_osv16, { 0, 1, -1, 2, 3, -1, -1, 4 } },
+ { WeightsLayout::gs_oi_yxs_gsv4_yxsv4, { 0, 1, -1, 2, 3, -1, -1, 4 } },
+ { WeightsLayout::gs_oi_yxs_gsv16_yxsv4, { 0, 1, -1, 2, 3, -1, -1, 4 } },
+ { WeightsLayout::gs_oi_yxs_gsv32_yxsv4, { 0, 1, -1, 2, 3, -1, -1, 4 } },
+ { WeightsLayout::g_os_is_yx_osv16_isv4, { 0, 1, -1, 2, 3, -1, -1, 4 } },
}};
NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
case gs_oi_yxs_gsv4_yxsv4:
newDims[4] = RoundUp(newDims[4], 4);
break;
+ case os_is_yx_osv16_isv16:
+ assert(newDims.size() == 4);
+ newDims[2] = RoundUp(newDims[2], 16);
+ newDims[3] = RoundUp(newDims[3], 16);
+ break;
+ case gs_oi_yxs_gsv16_yxsv4:
+ newDims[4] = RoundUp(newDims[4], 16);
+ break;
+ case gs_oi_yxs_gsv32_yxsv4:
+ newDims[4] = RoundUp(newDims[4], 32);
+ break;
+ case g_os_is_yx_osv16_isv4:
+ assert(newDims.size() == 5);
+ newDims[2] = RoundUp(newDims[2], 4);
+ newDims[3] = RoundUp(newDims[3], 16);
+ break;
default:
break;
}
} else if (l == gs_oi_yxs_gsv4_yxsv4) {
ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 4;
ret[4].pitch = ret[3].v * RoundUp(ret[0].v * ret[1].v, 4);
+ } else if (l == gs_oi_yxs_gsv16_yxsv4) {
+ ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 16;
+ ret[4].pitch = ret[3].v * RoundUp(ret[0].v * ret[1].v, 4);
+ } else if (l == gs_oi_yxs_gsv32_yxsv4) {
+ ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 32;
+ ret[4].pitch = ret[3].v * RoundUp(ret[0].v * ret[1].v, 4);
}
-
return ret;
}
b_fs_yx_32fp, // bfyx with blocks of 16 packed binary input channels
bfwzyx, // batch, feature, 4D spatial
nv12, // media nv12 layout
+ image_2d_rgba, // image2d RGBA
DataLayoutCount // NUMBER OF ELEMENTS IN ENUM
};
os_i_osv8__ai8, // TODO can we drop the alignment form layout name?
os_i_osv16__ai8,
os_i_osv16,
+ os_is_yx_osv16_isv16, // weights for int8 blocked conv
i_yxs_os_yxsv2_osv16,
iy_xs_os_xsv2_osv16__ao32,
iy_xs_os_xsv2_osv8__ao32,
g_os_is_zyx_isv16_osv16,
giy_xs_os_xsv2_osv16__ao32,
giy_xs_os_xsv2_osv8__ao32,
- gs_oi_yxs_gsv4_yxsv4, // grouped weights for depthwise IMAD convolution
g_os_is_yx_isv16_osv16,
+ gs_oi_yxs_gsv4_yxsv4, // grouped weights for depthwise IMAD convolution (b_fs_yx_fsv4 format)
+ gs_oi_yxs_gsv16_yxsv4, // grouped weights for depthwise IMAD convolution (b_fs_yx_fsv16 format)
+ gs_oi_yxs_gsv32_yxsv4, // grouped weights for depthwise IMAD convolution (b_fs_yx_fsv32 format)
+
+ g_os_is_yx_osv16_isv4,
WeightsLayoutCount // NUMBER OF ELEMENTS IN ENUM
};
case WeightsLayout::giy_xs_os_xsv2_osv16__ao32:
case WeightsLayout::giy_xs_os_xsv2_osv8__ao32:
case WeightsLayout::gs_oi_yxs_gsv4_yxsv4:
+ case WeightsLayout::g_os_is_yx_osv16_isv4:
return true;
default:
return false;
k.EnableConcatAxis(ConcatAxis::FEATURE);
k.EnableConcatAxis(ConcatAxis::BATCH);
k.EnableConcatKernelPerInput();
+ k.EnableDifferentTypes();
return k;
}
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+//
+// Kernel specific constants
+//
+#define SIMD_SIZE 16
+
+namespace kernel_selector {
+
+namespace {
+
+size_t getOutBlock_X(size_t output_size_x, size_t stride_x) {
+ size_t output_block_width = 0;
+ size_t max_block_size = std::min((SIMD_SIZE - 1) / stride_x + 1, output_size_x);
+
+ if (output_size_x <= max_block_size)
+ return output_size_x;
+
+ for (size_t block = 4; block <= max_block_size; ++block) {
+ if (output_size_x % block == 0)
+ output_block_width = block;
+ }
+ if (output_block_width == 0 && output_size_x < max_block_size * 3) {
+ size_t min_overhang = max_block_size;
+ for (size_t block = 4; block <= max_block_size; ++block) {
+ size_t overhang = block - output_size_x % block;
+ if (overhang <= min_overhang) {
+ min_overhang = overhang;
+ output_block_width = block;
+ }
+ }
+ }
+
+ if (output_block_width == 0) {
+ output_block_width = max_block_size;
+ }
+ return output_block_width;
+}
+
+bool should_k_slice(const convolution_params& params, size_t output_block_width) {
+ constexpr float preferred_eu_occupancy = 5.f;
+ if (params.inputs[0].Feature().v % (16 * 4) != 0)
+ return false;
+
+ size_t eu_count = params.engineInfo.computeUnitsCount;
+ auto global_size = CeilDiv(params.output.X().v, output_block_width) *
+ params.output.Y().v *
+ params.output.Batch().v * Align(CeilDiv(params.output.Feature().v, 2), SIMD_SIZE);
+ auto threads = global_size / SIMD_SIZE;
+ auto optimal_threads_num = eu_count * preferred_eu_occupancy;
+ return threads < optimal_threads_num;
+}
+
+} // namespace
+
+Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Convolution_kernel_b_fs_yx_fsv16_imad_1x1()
+ : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_1x1") {
+ for (size_t bw = 1; bw <= SIMD_SIZE; ++bw) {
+ for (auto exe : ConvolutionKernelBase::autoTuneOptions) {
+ all_tune_params.push_back(AutoTuneParams{ bw, true, exe });
+ all_tune_params.push_back(AutoTuneParams{ bw, false, exe });
+ }
+ }
+}
+
+ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
+
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::F32);
+ k.EnableOutputDataType(Datatype::F16);
+
+ k.EnableInputWeightsType(WeightsType::INT8);
+
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
+ k.EnableDifferentTypes();
+ k.EnableDifferentInputWeightsTypes();
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableBiasPerFeature();
+ k.EnableNonBiasTerm();
+ k.EnableBatching();
+ k.EnableQuantization(QuantizationType::SYMMETRIC);
+ return k;
+}
+
+JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetJitConstants(const convolution_params& params,
+ const DispatchData& kd) const {
+ auto mem_consts = Parent::GetJitConstants(params, kd);
+ mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
+ mem_consts.AddConstant(MakeJitConstant("FEATURE_LWS_SPLIT", kd.cldnnStyle.prefetch));
+
+ if (!params.fused_ops.empty()) {
+ auto input_dt = GetActivationType(params);
+ FusedOpsConfiguration conf_scalar = {"", {"out_b", "out_f + out_f_offset", "out_y", "out_x + i"}, "dequantized", input_dt, 1 };
+ conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
+ mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
+ }
+
+ return mem_consts;
+} // GetJitConstants
+
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::SetDefault(const convolution_params& params,
+ int index) const {
+ DispatchData kd;
+ const auto& output = params.output;
+ auto tune_params = GetAutoTuneParams(params, index);
+ size_t k_slices = tune_params.k_slicing ? 4 : 1;
+
+ kd.gws0 = CeilDiv(output.X().v, tune_params.out_block_width);
+ kd.gws1 = output.Y().v;
+ kd.gws2 = output.Batch().v * Align(CeilDiv(output.Feature().v, 2), SIMD_SIZE) * k_slices;
+
+ kd.lws0 = 1;
+ kd.lws1 = 1;
+ kd.lws2 = SIMD_SIZE * k_slices;
+
+ kd.cldnnStyle = {0, 0, 0, 0, 0};
+ kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+
+ kd.cldnnStyle.blockWidth = tune_params.out_block_width;
+ kd.cldnnStyle.prefetch = k_slices;
+
+ kd.efficiency = FORCE_PRIORITY_2;
+
+ return kd;
+} // SetDefault
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Validate(const Params& params, const optional_params& options) const {
+ if (!Parent::Validate(params, options)) {
+ return false;
+ }
+
+ KernelData kd = KernelData::Default<convolution_params>(params);
+ convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
+
+ if ((newParams.filterSize.x != newParams.filterSize.y) ||
+ newParams.filterSize.x != 1) {
+ // Filter size needs to be 1x1
+ return false;
+ }
+
+ if ((newParams.stride.x != newParams.stride.y) ||
+ (newParams.stride.x != 1 && newParams.stride.x != 2)) {
+ // Strides must be 1x1 or 2x2
+ return false;
+ }
+
+ if (newParams.groups != 1 || newParams.split != 1)
+ return false;
+
+ return true;
+}
+
+Convolution_kernel_b_fs_yx_fsv16_imad_1x1::AutoTuneParams
+Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetAutoTuneParams(const convolution_params& params, int index) const {
+ if (index >= 0 && index < static_cast<int>(all_tune_params.size())) {
+ return all_tune_params[index];
+ }
+ AutoTuneParams default_params;
+ default_params.out_block_width = getOutBlock_X(params.output.X().v, params.stride.x);
+ default_params.k_slicing = should_k_slice(params, default_params.out_block_width);
+ default_params.exe_mode = DEFAULT;
+ return default_params;
+}
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::ValidateAutoTuneParams(const convolution_params& params,
+ const AutoTuneParams& tune_params) const {
+ if (tune_params.k_slicing && params.inputs[0].Feature().v % (16 * 4) != 0)
+ return false;
+
+ size_t max_block_size = std::min(static_cast<size_t>((SIMD_SIZE - 1) / params.stride.x + 1), params.output.X().v);
+ if (tune_params.out_block_width > max_block_size)
+ return false;
+
+ return true;
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetKernelsData(const Params& params,
+ const optional_params& options) const {
+ return GetTunedKernelsDataByIndex(params, options);
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetTunedKernelsDataByIndex(const Params & params,
+ const optional_params & options,
+ int autoTuneIndex) const {
+ auto conv_params = static_cast<const convolution_params&>(params);
+ auto tune_params = GetAutoTuneParams(conv_params, autoTuneIndex);
+ if (!ValidateAutoTuneParams(conv_params, tune_params))
+ return {};
+ return GetCommonKernelsData(params, options, tune_params.exe_mode, autoTuneIndex);
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetKernelsDataForAutoTune(const Params & params,
+ const optional_params & options) const {
+ if (!Validate(params, options)) {
+ return {};
+ }
+ auto& conv_params = static_cast<const convolution_params&>(params);
+
+ KernelsData res = {};
+
+ for (size_t i = 0; i < all_tune_params.size(); i++) {
+ auto tune_params = GetAutoTuneParams(conv_params, static_cast<int>(i));
+ if (!ValidateAutoTuneParams(conv_params, tune_params))
+ continue;
+ KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
+ if (!kd.empty()) {
+ res.emplace_back(kd[0]);
+ }
+ }
+
+ return res;
+}
+
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+#include <string>
+
+namespace kernel_selector {
+
+class Convolution_kernel_b_fs_yx_fsv16_imad_1x1 : public ConvolutionKernelBase {
+public:
+ using Parent = ConvolutionKernelBase;
+ Convolution_kernel_b_fs_yx_fsv16_imad_1x1();
+ virtual ~Convolution_kernel_b_fs_yx_fsv16_imad_1x1() {}
+
+ KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+ KernelsData GetKernelsDataForAutoTune(const Params & params, const optional_params & options) const override;
+ KernelsData GetTunedKernelsDataByIndex(const Params & params, const optional_params & options, int autoTuneIndex = -1) const override;
+ ParamsKey GetSupportedKey() const override;
+
+protected:
+ bool Validate(const Params& params, const optional_params& options) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+ bool NeedPaddedInput() const override { return true; }
+ WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+ return WeightsLayout::os_is_yx_osv16_isv16;
+ }
+
+ std::vector<FusedOpType> GetSupportedFusedOps() const override {
+ return { FusedOpType::ELTWISE,
+ FusedOpType::QUANTIZE,
+ FusedOpType::SCALE,
+ FusedOpType::ACTIVATION };
+ }
+
+ struct AutoTuneParams {
+ size_t out_block_width;
+ bool k_slicing;
+ std::string exe_mode;
+ };
+ std::vector<AutoTuneParams> all_tune_params;
+
+ bool ValidateAutoTuneParams(const convolution_params& params, const AutoTuneParams& tune_params) const;
+ AutoTuneParams GetAutoTuneParams(const convolution_params& params, int index) const;
+};
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include <vector>
+#include <iostream>
+
+//
+// Kernel specific constants
+//
+#define SIMD_SIZE 16
+
+static size_t getOutBlock_X(const size_t output_size_x, const size_t stride_x, const size_t filter_size_x) {
+ size_t output_block_width = 0;
+ size_t max_block_size = std::min((SIMD_SIZE - filter_size_x) / stride_x + 1, output_size_x);
+
+ if (output_size_x <= max_block_size)
+ return output_size_x;
+
+ for (size_t block = 4; block <= max_block_size; ++block) {
+ if (output_size_x % block == 0)
+ output_block_width = block;
+ }
+ if (output_block_width == 0 && output_size_x < max_block_size * 3) {
+ size_t min_overhang = max_block_size;
+ for (size_t block = 4; block <= max_block_size; ++block) {
+ size_t overhang = block - output_size_x % block;
+ if (overhang <= min_overhang) {
+ min_overhang = overhang;
+ output_block_width = block;
+ }
+ }
+ }
+
+ if (output_block_width == 0) {
+ output_block_width = max_block_size;
+ }
+ return output_block_width;
+}
+
+static size_t get_ofm_per_wi(const size_t output_size_f) {
+ if (output_size_f % 32 == 0)
+ return 2;
+ return 1;
+}
+
+namespace kernel_selector {
+
+ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_3x3::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
+
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::F32);
+ k.EnableOutputDataType(Datatype::F16);
+
+ k.EnableInputWeightsType(WeightsType::INT8);
+
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
+ k.EnableDifferentTypes();
+ k.EnableDifferentInputWeightsTypes();
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableBiasPerFeature();
+ k.EnableNonBiasTerm();
+ k.EnableBatching();
+ k.EnableQuantization(QuantizationType::SYMMETRIC);
+ k.DisableTuning();
+ return k;
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_3x3::GetKernelsData(const Params& params,
+ const optional_params& options) const {
+ return GetCommonKernelsData(params, options);
+}
+
+JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_3x3::GetJitConstants(const convolution_params& params,
+ const DispatchData& kd) const {
+ auto mem_consts = Parent::GetJitConstants(params, kd);
+ const auto& output = params.output;
+
+ mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", getOutBlock_X(output.X().v, params.stride.x, params.filterSize.x)));
+ mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", get_ofm_per_wi(output.Feature().v)));
+ mem_consts.AddConstant(MakeJitConstant("OFM_SIZE_PER_SIMD", SIMD_SIZE * get_ofm_per_wi(output.Feature().v)));
+
+ if (!params.fused_ops.empty()) {
+ auto input_dt = GetActivationType(params);
+ FusedOpsConfiguration conf_scalar = {"", {"out_b", "out_f + j * 16", "out_y", "out_x + i"}, "dequantized", input_dt, 1};
+ conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
+ mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
+ }
+
+ return mem_consts;
+} // GetJitConstants
+
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_3x3::SetDefault(const convolution_params& params,
+ int) const {
+ DispatchData kd;
+ const auto& output = params.output;
+ auto output_block_width = getOutBlock_X(output.X().v, params.stride.x, params.filterSize.x);
+ auto ofm_blocks_per_simd = get_ofm_per_wi(output.Feature().v);
+
+ kd.gws0 = CeilDiv(output.X().v, output_block_width);
+ kd.gws1 = output.Y().v;
+ kd.gws2 = output.Batch().v * Align(output.Feature().v / ofm_blocks_per_simd, SIMD_SIZE);
+
+ kd.lws0 = 1;
+ kd.lws1 = 1;
+ kd.lws2 = SIMD_SIZE;
+
+ kd.cldnnStyle = {0, 0, 0, 0, 0};
+ kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+
+ if (params.filterSize.x == 3)
+ kd.efficiency = FORCE_PRIORITY_2;
+ else
+ kd.efficiency = FORCE_PRIORITY_5;
+
+ return kd;
+} // SetDefault
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_3x3::Validate(const Params& params, const optional_params& options) const {
+ if (!Parent::Validate(params, options)) {
+ return false;
+ }
+
+ KernelData kd = KernelData::Default<convolution_params>(params);
+ convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
+
+ if ((newParams.filterSize.x != newParams.filterSize.y) ||
+ (newParams.filterSize.x != 3 && newParams.filterSize.x != 5)) {
+ // Filter size needs to be 3x3 or 5x5
+ return false;
+ }
+
+ if ((newParams.stride.x != newParams.stride.y) ||
+ (newParams.stride.x != 1 && newParams.stride.x != 2)) {
+ // Strides must be 1x1 or 2x2
+ return false;
+ }
+
+ if (newParams.groups != 1 || newParams.split != 1)
+ return false;
+
+ return true;
+}
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+
+namespace kernel_selector {
+
+class Convolution_kernel_b_fs_yx_fsv16_imad_3x3 : public ConvolutionKernelBase {
+public:
+ using Parent = ConvolutionKernelBase;
+ Convolution_kernel_b_fs_yx_fsv16_imad_3x3() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_3x3") {}
+ virtual ~Convolution_kernel_b_fs_yx_fsv16_imad_3x3() {}
+
+ KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+ ParamsKey GetSupportedKey() const override;
+
+protected:
+ bool Validate(const Params& params, const optional_params& options) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+ bool NeedPaddedInput() const override { return true; }
+ WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+ return WeightsLayout::os_is_yx_osv16_isv16;
+ }
+
+ std::vector<FusedOpType> GetSupportedFusedOps() const override {
+ return { FusedOpType::ELTWISE,
+ FusedOpType::QUANTIZE,
+ FusedOpType::SCALE,
+ FusedOpType::ACTIVATION };
+ }
+};
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include <vector>
+#include <iostream>
+
+//
+// Kernel specific constants
+//
+#define SIMD_SIZE 16
+
+static size_t getOutBlock_X(size_t output_size_x) {
+ auto output_block_width = 7;
+ if (output_size_x % 8 == 0)
+ output_block_width = 8;
+ return output_block_width;
+}
+
+
+namespace kernel_selector {
+
+ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
+
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::F32);
+ k.EnableOutputDataType(Datatype::F16);
+
+ k.EnableInputWeightsType(WeightsType::INT8);
+
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
+ k.EnableDifferentTypes();
+ k.EnableDifferentInputWeightsTypes();
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableBiasPerFeature();
+ k.EnableNonBiasTerm();
+ k.EnableBatching();
+ k.EnableQuantization(QuantizationType::SYMMETRIC);
+ k.DisableTuning();
+ return k;
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::GetKernelsData(const Params& params,
+ const optional_params& options) const {
+ return GetCommonKernelsData(params, options);
+}
+
+JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::GetJitConstants(const convolution_params& params,
+ const DispatchData& kd) const {
+ auto mem_consts = Parent::GetJitConstants(params, kd);
+ const auto& output = params.output;
+
+ mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", getOutBlock_X(output.X().v))});
+
+ if (!params.fused_ops.empty()) {
+ auto input_dt = GetActivationType(params);
+ FusedOpsConfiguration conf_scalar = {"",
+ {"out_b", "(out_f + get_sub_group_id() * 16)", "out_y", "out_x + i"},
+ "dequantized",
+ input_dt,
+ 1};
+ conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
+ mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
+ }
+
+ return mem_consts;
+} // GetJitConstants
+
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::SetDefault(
+ const convolution_params& params,
+ int) const {
+ DispatchData kd;
+ const auto& output = params.output;
+
+ auto output_block_width = getOutBlock_X(output.X().v);
+ kd.gws0 = output.X().v / output_block_width;
+ kd.gws1 = output.Y().v;
+ kd.gws2 = output.Batch().v * output.Feature().v * 2;
+
+ kd.lws0 = 1;
+ kd.lws1 = 1;
+ kd.lws2 = SIMD_SIZE * 4;
+
+ kd.cldnnStyle = {0, 0, 0, 0, 0};
+ kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+
+ kd.efficiency = FORCE_PRIORITY_1;
+
+ return kd;
+} // SetDefault
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::Validate(const Params& params, const optional_params& options) const {
+ if (!Parent::Validate(params, options)) {
+ return false;
+ }
+
+ KernelData kd = KernelData::Default<convolution_params>(params);
+ convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
+
+ if (newParams.output.Feature().v % (2 * SIMD_SIZE) != 0) {
+ return false;
+ }
+
+ if ((newParams.filterSize.x != newParams.filterSize.y) ||
+ newParams.filterSize.x != 3) {
+ // Filter size needs to be 3x3
+ return false;
+ }
+
+ if ((newParams.stride.x != newParams.stride.y) ||
+ (newParams.stride.x != 1 && newParams.stride.x != 2)) {
+ // Strides must be 1x1 or 2x2
+ return false;
+ }
+
+ if (newParams.output.X().v % 8 != 0 && newParams.output.X().v % 7 != 0) {
+ return false;
+ }
+
+ if (CeilDiv(newParams.inputs[0].Feature().v, 16) % 4 != 0) {
+ return false;
+ }
+
+ const auto& output = newParams.output;
+ auto output_block_width = getOutBlock_X(output.X().v);
+ size_t eu_count = params.engineInfo.computeUnitsCount;
+ auto global_size =
+ (output.X().v / output_block_width) * output.Y().v * ((output.Batch().v * output.Feature().v));
+ if ((global_size / 16) > (eu_count * 7)) {
+ return false;
+ }
+
+ if (newParams.groups != 1 || newParams.split != 1)
+ return false;
+
+ return true;
+}
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+
+namespace kernel_selector {
+
+class Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks : public ConvolutionKernelBase {
+public:
+ using Parent = ConvolutionKernelBase;
+ Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_3x3_ks") {}
+ virtual ~Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks() {}
+
+ KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+ ParamsKey GetSupportedKey() const override;
+
+protected:
+ bool Validate(const Params& params, const optional_params& options) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+ bool NeedPaddedInput() const override { return true; }
+ WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+ return WeightsLayout::os_is_yx_osv16_isv16;
+ }
+
+ std::vector<FusedOpType> GetSupportedFusedOps() const override {
+ return { FusedOpType::ELTWISE,
+ FusedOpType::QUANTIZE,
+ FusedOpType::SCALE,
+ FusedOpType::ACTIVATION };
+ }
+};
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+namespace kernel_selector {
+
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw()
+ : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv_16_32_imad_dw") {
+ std::vector<size_t> simd_sizes = { 8, 16 };
+ std::vector<size_t> tile_x_sizes = { 1, 2, 3, 4, 5, 7, 8, 11, 16, 24, 32 };
+ std::vector<std::string> exe_modes = ConvolutionKernelBase::autoTuneOptions;
+
+ constexpr size_t max_block_size = 32 * 8;
+
+ for (auto simd : simd_sizes) {
+ for (size_t tile_x = 1; tile_x <= 32; ++tile_x) {
+ if (simd * tile_x > max_block_size)
+ continue;
+ for (auto exe_mode : exe_modes) {
+ all_tune_params.push_back(AutoTuneParams{ simd, tile_x, exe_mode });
+ }
+ }
+ }
+}
+
+ParamsKey kernel_selector::ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::F16);
+ k.EnableOutputDataType(Datatype::F32);
+ k.EnableInputWeightsType(WeightsType::INT8);
+ k.EnableInputWeightsType(WeightsType::UINT8);
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
+ k.EnableDifferentTypes();
+ k.EnableDifferentInputWeightsTypes();
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableBiasPerFeature();
+ k.EnableNonBiasTerm();
+ k.EnableBatching();
+ k.EnableQuantization(QuantizationType::SYMMETRIC);
+ k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
+ k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
+ k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
+ k.EnableDepthwiseSeparableOpt();
+ k.EnableGroupedConvolution();
+ return k;
+}
+
+bool ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::Validate(const Params& params, const optional_params& options) const {
+    if (!Parent::Validate(params, options))
+        return false;
+
+    // Fix: bind by const reference; the original
+    // `auto conv_params = static_cast<const convolution_params&>(params);`
+    // deduced a value type and deep-copied the whole parameter struct per call.
+    const auto& conv_params = static_cast<const convolution_params&>(params);
+
+    // Input and output must share the same fsv16/fsv32 blocked layout.
+    if (conv_params.inputs[0].GetLayout() != conv_params.output.GetLayout())
+        return false;
+
+    // Depthwise only: one group per feature on both input and output.
+    if (conv_params.groups != conv_params.output.Feature().v || conv_params.groups != conv_params.inputs[0].Feature().v)
+        return false;
+
+    // Additional checks for asymmetric data
+    if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA ||
+        conv_params.quantization == QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS) {
+        // Needs compensation optimization
+        if (conv_params.compensation.empty())
+            return false;
+        // Padding not supported
+        // Input extent required per axis to produce the full output window.
+        const auto inputLimitX = (conv_params.output.X().v - 1) * conv_params.stride.x
+                                 + (conv_params.filterSize.x - 1) * conv_params.dilation.x + 1;
+        const auto inputLimitY = (conv_params.output.Y().v - 1) * conv_params.stride.y
+                                 + (conv_params.filterSize.y - 1) * conv_params.dilation.y + 1;
+        const auto inputLimitZ = (conv_params.output.Z().v - 1) * conv_params.stride.z
+                                 + (conv_params.filterSize.z - 1) * conv_params.dilation.z + 1;
+
+        // NOTE(review): the required input extents are compared against the
+        // *output* dimensions below; confirm this should not be
+        // conv_params.inputs[0] — as written, any filter larger than 1x1 with
+        // unit stride makes inputLimit > output and rejects the kernel.
+        bool needs_pad = false;
+        needs_pad |= conv_params.padding.x != 0;
+        needs_pad |= conv_params.padding.y != 0;
+        needs_pad |= conv_params.padding.z != 0;
+        needs_pad |= inputLimitX > conv_params.output.X().v;
+        needs_pad |= inputLimitY > conv_params.output.Y().v;
+        needs_pad |= inputLimitZ > conv_params.output.Z().v;
+
+        if (needs_pad)
+            return false;
+    }
+
+    return true;
+}
+
+// Weights layout mirrors the output feature-slice size: gsv16 for fsv16
+// outputs, gsv32 otherwise (fsv32), both with IMAD-friendly yxsv4 packing.
+WeightsLayout ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetPreferredWeightsLayout(const convolution_params& params) const {
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16)
+        return WeightsLayout::gs_oi_yxs_gsv16_yxsv4;
+    else
+        return WeightsLayout::gs_oi_yxs_gsv32_yxsv4;
+}
+
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::AutoTuneParams
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetAutoTuneParams(const convolution_params& params, int index) const {
+    // A valid index selects a pre-built candidate from the constructor's table.
+    if (index >= 0 && index < static_cast<int>(all_tune_params.size())) {
+        return all_tune_params[index];
+    }
+    // Otherwise derive a heuristic default: SIMD 16 with a tile width matching
+    // the output layout (wider tile for fsv16), clamped to the output width.
+    // NOTE(review): exeMode is left default-initialized (empty string) here.
+    AutoTuneParams tune_params;
+    tune_params.simd = 16;
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16) {
+        tune_params.tile_x = std::min((size_t)16, params.output.X().v);
+    } else {
+        tune_params.tile_x = std::min((size_t)8, params.output.X().v);
+    }
+
+    // Halve the tile when the output width is small and not a multiple of it,
+    // reducing wasted work in the final partial tile.
+    if (params.output.X().v < 3 * tune_params.tile_x && params.output.X().v % tune_params.tile_x != 0) {
+        tune_params.tile_x = tune_params.tile_x / 2;
+    }
+
+    return tune_params;
+}
+
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::DispatchData
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::SetDefault(const convolution_params& params, int autoTuneIndex) const {
+    DispatchData kd;
+    auto& out = params.output;
+
+    auto tune_params = GetAutoTuneParams(params, autoTuneIndex);
+
+    // Feature-slice size is dictated by the blocked output layout.
+    size_t fsv = 1;
+    if (out.GetLayout() == DataLayout::b_fs_yx_fsv16) {
+        fsv = 16;
+    } else if (out.GetLayout() == DataLayout::b_fs_yx_fsv32) {
+        fsv = 32;
+    }
+
+    // gws0: one work-item per tile_x-wide output chunk; gws2: one sub-group
+    // (simd lanes) per feature slice, for every batch.
+    std::vector<size_t> global = {
+        CeilDiv(out.X().v, tune_params.tile_x),
+        out.Y().v,
+        CeilDiv(out.Feature().v, fsv) * tune_params.simd * out.Batch().v
+    };
+    std::vector<size_t> local = { 1, 1, tune_params.simd };
+
+    kd.gws0 = global[0];
+    kd.gws1 = global[1];
+    kd.gws2 = global[2];
+
+    kd.lws0 = local[0];
+    kd.lws1 = local[1];
+    kd.lws2 = local[2];
+
+    kd.gemmStyle = { 0, 0, 0, 0, 0, 0 };
+
+    // Hand the chosen tile width to GetJitConstants via cldnnStyle.
+    kd.cldnnStyle.blockWidth = tune_params.tile_x;
+
+    // Prefer this kernel most strongly for stride-1 convolutions.
+    kd.efficiency = params.stride.x == 1 ? FORCE_PRIORITY_1 : FORCE_PRIORITY_2;
+
+    return kd;
+}
+
+JitConstants ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
+    auto mem_consts = Parent::GetJitConstants(params, kd);
+
+    // FILTER_BLOCKED: the largest multiple of the IMAD accumulation width (4)
+    // that fits the filter's spatial size; the remainder is handled separately.
+    constexpr size_t imad_width = 4;
+    auto filter_spatial = params.weights.X().v * params.weights.Y().v;
+    auto filter_blocked = filter_spatial / imad_width * imad_width;
+
+    mem_consts.AddConstant(MakeJitConstant("LWS0", kd.lws0));
+    mem_consts.AddConstant(MakeJitConstant("LWS1", kd.lws1));
+    mem_consts.AddConstant(MakeJitConstant("SIMD", kd.lws2));
+
+    mem_consts.AddConstant(MakeJitConstant("TILE_X", kd.cldnnStyle.blockWidth));
+    mem_consts.AddConstant(MakeJitConstant("FILTER_BLOCKED", filter_blocked));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        // Three fused-ops configurations for vector sizes 1, 2 and 4, all
+        // using aligned sub-group reads over the feature channel.
+        auto conf_1 = FusedOpsConfiguration("_1",
+                                            { "b", "fused_ops_f", "y", "fused_ops_x" },
+                                            "fused_ops_in",
+                                            input_dt,
+                                            1,
+                                            LoadType::LT_ALIGNED_READ,
+                                            BoundaryCheck::ENABLED,
+                                            IndexType::TENSOR_COORD,
+                                            Tensor::DataChannelName::FEATURE);
+        auto conf_2 = conf_1;
+        conf_2.suffix = "_2";
+        conf_2.vec_size = 2;
+        auto conf_4 = conf_1;
+        conf_4.suffix = "_4";
+        conf_4.vec_size = 4;
+        mem_consts.Merge(MakeFusedOpsJitConstants(params, { conf_1, conf_2, conf_4 }));
+    }
+
+    return mem_consts;
+}
+
+// Builds kernel data for one auto-tune candidate (or the heuristic default
+// when autoTuneIndex is out of range), forwarding its execution mode.
+KernelsData ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetTunedKernelsDataByIndex(const Params& params,
+                                                                                    const optional_params& options,
+                                                                                    int autoTuneIndex) const {
+    // Fix: bind by const reference; the original `auto convParams = ...` copied
+    // the whole convolution_params struct on every call.
+    const auto& convParams = static_cast<const convolution_params&>(params);
+    auto tuneParams = GetAutoTuneParams(convParams, autoTuneIndex);
+    return GetCommonKernelsData(params, options, tuneParams.exeMode, autoTuneIndex);
+}
+
+// Default entry point: use the heuristic tuning parameters (index -1).
+KernelsData ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetTunedKernelsDataByIndex(params, options);
+}
+
+// Produces one kernel candidate per entry of the auto-tune table; invalid
+// parameters yield an empty list.
+KernelsData ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetKernelsDataForAutoTune(const Params& params,
+                                                                                   const optional_params& options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    KernelsData res = {};
+
+    for (size_t i = 0; i < all_tune_params.size(); i++) {
+        // Fix: dropped the unused per-iteration GetAutoTuneParams(...) local
+        // (and the conv_params cast that only fed it) — the result was
+        // discarded, and GetTunedKernelsDataByIndex re-derives it anyway.
+        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
+        if (!kd.empty()) {
+            res.emplace_back(kd[0]);
+        }
+    }
+
+    return res;
+}
+
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+#include <string>
+
+namespace kernel_selector {
+// IMAD-based depthwise convolution kernel for the b_fs_yx_fsv16 /
+// b_fs_yx_fsv32 int8 layouts, auto-tuned over sub-group size, output tile
+// width and execution mode.
+class ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw();
+    virtual ~ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw() {}
+
+    ParamsKey GetSupportedKey() const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsData GetKernelsDataForAutoTune(const Params & params, const optional_params & options) const override;
+    KernelsData GetTunedKernelsDataByIndex(const Params & params, const optional_params & options, int autoTuneIndex = -1) const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params& params) const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ACTIVATION };
+    }
+
+    bool NeedPaddedInput() const override { return true; }
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+
+    // One auto-tune candidate: sub-group size, output-width tile, exec mode.
+    struct AutoTuneParams {
+        size_t simd;
+        size_t tile_x;
+        std::string exeMode;
+    };
+    // Candidate table built once in the constructor.
+    std::vector<AutoTuneParams> all_tune_params;
+
+    AutoTuneParams GetAutoTuneParams(const convolution_params& params, int index) const;
+};
+} // namespace kernel_selector
if (NeedPaddedInput()) {
kd.reorderInput = CovolutionUpdateInputParams(newParams);
+
+ if (kd.reorderInput && !options.allowInputReordering)
+ return {};
}
DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "convolution_kernel_bfyx_iyxo.h"
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace kernel_selector {
+// Sub-group size used by "convolution_kernel_bfyx_iyxo" kernel.
+constexpr size_t sub_group_size = 16;
+
+// Supported-features key: f16-only data and weights in plain bfyx layout,
+// with sub-group support and per-feature bias.
+ParamsKey ConvolutionKernel_bfyx_iyxo::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputWeightsType(WeightsType::F16);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableSubGroup();
+    k.EnableBiasPerFeature();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    return k;
+}
+
+ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_iyxo::SetDefault(const convolution_params& cp, int) const {
+    DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+
+    runInfo.efficiency = FORCE_PRIORITY_9;
+
+    // Each gws0 work-item covers 4 sub-group-wide (4 * 16 = 64) output chunks
+    // along X. NOTE(review): relies on Validate() rejecting input widths not
+    // divisible by 64 — confirm output width obeys the same constraint.
+    runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 4;
+    runInfo.gws1 = cp.output.Y().v;
+    runInfo.gws2 = sub_group_size;
+
+    runInfo.lws0 = 1;
+    runInfo.lws1 = 1;
+    runInfo.lws2 = sub_group_size;
+
+    return runInfo;
+}
+
+bool ConvolutionKernel_bfyx_iyxo::Validate(const Params& p, const optional_params& o) const {
+    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
+        return false;
+    }
+
+    const auto& params = static_cast<const convolution_params&>(p);
+    // Input width must be a multiple of 64 (SetDefault walks X in 64-wide chunks).
+    if (params.inputs[0].X().v % 64)
+        return false;
+
+    // Only 5x5, 3x3 (input features divisible by 4) or 1x1 filters.
+    bool bFilterSize = (params.filterSize.x == 5 && params.filterSize.y == 5) ||
+                       (params.filterSize.x == 3 && params.filterSize.y == 3 && (params.inputs[0].Feature().v % 4) == 0) ||
+                       (params.filterSize.x == 1 && params.filterSize.y == 1);
+
+    // Unit stride only.
+    bool bStride = (params.stride.x == 1 && params.stride.y == 1);
+
+    // Output features must be a multiple of 4 and batch must be 1.
+    if (!bFilterSize || !bStride || (params.output.Feature().v % 4) != 0 || (params.output.Batch().v != 1)) {
+        return false;
+    }
+
+    return true;
+}
+
+JitConstants ConvolutionKernel_bfyx_iyxo::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const {
+    auto jit = Parent::GetJitConstants(params, runInfo);
+
+    // Expose the dispatch sub-group size (lws2) to the OpenCL kernel.
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+
+    return jit;
+}
+
+// Default entry point: delegate to the base tuned-by-index path (index -1).
+KernelsData ConvolutionKernel_bfyx_iyxo::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetTunedKernelsDataByIndex(params, options);
+}
+
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <string>
+
+namespace kernel_selector {
+
+// f16 bfyx convolution kernel using iyxo weights; restricted by Validate()
+// to unit stride, batch 1 and a small set of filter sizes.
+class ConvolutionKernel_bfyx_iyxo : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    ConvolutionKernel_bfyx_iyxo() : Parent("convolution_gpu_bfyx_iyxo") {}
+    virtual ~ConvolutionKernel_bfyx_iyxo() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+        return WeightsLayout::iyxo;
+    }
+
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    bool Validate(const Params& p, const optional_params& o) const override;
+    bool NeedPaddedInput() const override { return true; }
+    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
+};
+} // namespace kernel_selector
//
#define SIMD_SIZE 16
-static bool getOutBlock_WH(size_t output_size,
+static void getOutBlock_WH(size_t output_size,
size_t stride,
size_t kernel_size,
+ size_t dilation,
size_t& output_block_w,
size_t& output_block_h) {
- bool verify_output_ranges = false;
-
output_block_w = output_block_h = 0;
size_t upper_border = output_size < SIMD_SIZE ? output_size : SIMD_SIZE;
- size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1)) / stride;
+ size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1) * dilation - 1) / stride + 1;
size_t max_posible_tile_size = upper_border < stride_restrictions ? upper_border : stride_restrictions;
size_t block_size = 0;
- for (size_t i = min_horisontal_block_size; i < max_posible_tile_size; i++) {
+ for (size_t i = min_horisontal_block_size; i <= max_posible_tile_size; i++) {
if (output_size % i == 0)
block_size = i;
}
output_block_w = block_size;
} else {
output_block_w = max_posible_tile_size;
- verify_output_ranges = true;
}
}
output_block_h = output_block_w;
else
output_block_h = 1;
-
- return verify_output_ranges;
}
namespace kernel_selector {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
+
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
+
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);
+
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::byxf_af32);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.EnableTensorOffset();
k.EnableTensorPitches();
-// k.EnableDilation();
+ k.EnableDilation();
k.EnableBiasPerFeature();
+ k.EnableGroupedConvolution();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
const auto& input = params.inputs[0];
const auto& output = params.output;
-
- const auto& iDims = input.GetDims();
- const auto& oDims = output.GetDims();
const auto& weights = params.weights;
- const auto& wDims = weights.GetDims();
- const int iX = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::X);
- const int iY = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::Y);
- const int iF = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::FEATURE);
- const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
- const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
- const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y);
+
+ size_t in_fsv = 4;
+ if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4)
+ in_fsv = 4;
+ else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16)
+ in_fsv = 16;
+ else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32)
+ in_fsv = 32;
mem_consts.AddConstants({
- MakeJitConstant("_IW", iDims[iX].v),
- MakeJitConstant("_IH", iDims[iY].v),
- MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)),
- MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after),
- MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after),
- MakeJitConstant("_OW", oDims[oX].v),
- MakeJitConstant("_OH", oDims[oY].v),
- MakeJitConstant("_OD", wDims[wOD].v),
- MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after),
- MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after),
+ MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)),
+ MakeJitConstant("IWPAD", input.X().pad.Total()),
+ MakeJitConstant("IHPAD", input.Y().pad.Total()),
+ MakeJitConstant("_OD", Align(output.Feature().v, SIMD_SIZE)),
+ MakeJitConstant("OWPAD", output.X().pad.Total()),
+ MakeJitConstant("OHPAD", output.Y().pad.Total()),
MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
- MakeJitConstant("K_HEIGHT", wDims[iY].v),
- MakeJitConstant("K_WIDTH", wDims[iX].v),
- MakeJitConstant("K_STRIDE", params.stride.x), // X and Y must be equal
});
if (params.filterSize.x != 3 || params.filterSize.y != 3) {
mem_consts.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
size_t obw, obh;
- bool verify_output_ranges = getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[iX].v, obw, obh);
+ getOutBlock_WH(output.X().v, params.stride.x, weights.X().v, params.dilation.x, obw, obh);
mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", obw),
- MakeJitConstant("OUT_BLOCK_HEIGHT", obh),
- MakeJitConstant("NEED_TO_VERIFY_OUTPUT_RANGES", verify_output_ranges)});
+ MakeJitConstant("OUT_BLOCK_HEIGHT", obh)});
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
} // GetJitConstants
ConvolutionKernelBase::DispatchData ConvolutionKernel_imad::SetDefault(const convolution_params& params,
- int) const {
+ int) const {
DispatchData kd;
- const auto& in = params.inputs[0];
const auto& output = params.output;
const auto& weights = params.weights;
- const auto& iDims = in.GetDims();
- const auto& oDims = output.GetDims();
- const auto& wDims = weights.GetDims();
- const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
- const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y);
- const int oB = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::BATCH);
- const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
- const int wX = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::X);
size_t otw, oth;
- getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[wX].v, otw, oth);
+ getOutBlock_WH(output.X().v, params.stride.x, weights.X().v, params.dilation.x, otw, oth);
- size_t dim_add = ((wDims[wOD].v * iDims[oB].v) % SIMD_SIZE);
- if (dim_add != 0)
- dim_add = SIMD_SIZE - dim_add;
+ std::vector<size_t> global = {// number of tiles needed to cover output width
+ CeilDiv(output.X().v, otw),
- std::vector<size_t> global = {// globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
- // number of tiles needed to cover output width
- CeilDiv(oDims[oX].v, otw),
-
- // globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
// number of tiles needed to cover output height
- CeilDiv(oDims[oY].v, oth),
+ CeilDiv(output.Y().v, oth),
- // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
// round depth range up
- ((wDims[wOD].v * iDims[oB].v) + dim_add)};
+ Align(weights.OFM().v, SIMD_SIZE) * params.groups * output.Batch().v};
std::vector<size_t> local = {1, 1, SIMD_SIZE};
// This kernel is quite slow for 1x1 and KHx1 kernels
// TODO: check if we need any optimized kernels in this layout
// If yes, we need to implement some customization for these cases.
- kd.efficiency = FORCE_PRIORITY_2;
+ kd.efficiency = FORCE_PRIORITY_3;
return kd;
} // SetDefault
return false;
}
- KernelData kd = KernelData::Default<convolution_params>(params);
- convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
-
- if (newParams.stride.x != newParams.stride.y) {
- // Strides must be equal
+ auto& newParams = static_cast<const convolution_params&>(params);
+ if ((newParams.inputs[0].Feature().v / newParams.groups) % 4 != 0)
return false;
- }
- if (newParams.output.X().v != newParams.output.Y().v) {
- // W and H must be equal
+
+ size_t min_block_size_x = (newParams.weights.X().v - 1) * newParams.dilation.x + 1;
+ if (min_block_size_x > SIMD_SIZE)
return false;
- }
return true;
}
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_yx_osv16_isv4;
+ WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
+ return p.groups > 1 ? WeightsLayout::g_os_is_yx_osv16_isv4 : WeightsLayout::os_is_yx_osv16_isv4;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
#include "convolution_kernel_bfyx_direct_10_12_16.h"
#include "convolution_kernel_bfyx_os_iyx_osv16.h"
#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h"
+#include "convolution_kernel_bfyx_iyxo.h"
#include "convolution_kernel_yxfb_ref.h"
#include "convolution_kernel_yxfb_yxio_b16.h"
#include "convolution_kernel_yxfb_yxio_b8.h"
#include "convolution_kernel_mmad_b_fs_yx_fsv32_dw.h"
#include "convolution_kernel_mmad_bfyx_b_fs_yx_fsv32.h"
#include "convolution_kernel_bfyx_to_bs_fs_yx_bsv16_fsv16.h"
+#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3.h"
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h"
+#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
namespace kernel_selector {
convolution_kernel_selector::convolution_kernel_selector() {
Attach<ConvolutionKernel_Ref>();
Attach<DeformableConvolutionKernel_bfyx_Ref>();
+ // b_fs_yx_fsv16 int8
+ Attach<Convolution_kernel_b_fs_yx_fsv16_imad_1x1>();
+ Attach<Convolution_kernel_b_fs_yx_fsv16_imad_3x3>();
+ Attach<Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks>();
+
// b_fs_yx_fsv16 and b_fs_zyx_fsv16
Attach<ConvolutionKernel_b_fs_yx_fsv16_depthwise>();
Attach<ConvolutionKernel_b_fs_yx_fsv16_1x1>();
Attach<ConvolutionKernel_bfyx_GEMMLike>();
Attach<ConvolutionKernel_bfyx_Direct_10_10_12>();
Attach<ConvolutionKernel_bfyx_os_iyx_osv16>();
+ Attach<ConvolutionKernel_bfyx_iyxo>();
Attach<ConvolutionKernel_bfyx_1x1>();
Attach<ConvolutionKernel_bfyx_1x1_gemm_buf>();
Attach<ConvolutionKernel_bfyx_depthwise_weights_lwg>();
Attach<ConvolutionKernel_mmad_b_fs_yx_fsv32>();
Attach<ConvolutionKernel_mmad_b_fs_yx_fsv32_dw>();
Attach<ConvolutionKernel_mmad_bfyx_b_fs_yx_fsv32>();
+ Attach<ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw>();
}
KernelsData convolution_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "depth_to_space_kernel_base.h"
+#include "kernel_selector_utils.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+
+// Accept only DEPTH_TO_SPACE parameter/option pairs.
+bool DepthToSpaceKernelBase::Validate(const Params& p, const optional_params& o) const {
+    if (p.GetType() != KernelType::DEPTH_TO_SPACE ||
+        o.GetType() != KernelType::DEPTH_TO_SPACE) {
+        return false;
+    }
+
+    return true;
+}
+
+CommonDispatchData DepthToSpaceKernelBase::SetDefault(const depth_to_space_params& params) const {
+    CommonDispatchData runInfo;
+
+    // One work-item per output element: batch x feature x (y*x).
+    std::vector<size_t> global = { params.output.Batch().v,
+                                   params.output.Feature().v,
+                                   params.output.Y().v * params.output.X().v };
+
+    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+
+    runInfo.gws0 = global[0];
+    runInfo.gws1 = global[1];
+    runInfo.gws2 = global[2];
+
+    runInfo.lws0 = local[0];
+    runInfo.lws1 = local[1];
+    runInfo.lws2 = local[2];
+
+    return runInfo;
+}
+
+JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    // Spatial upscale factor of the depth-to-space rearrangement.
+    jit.AddConstant(MakeJitConstant("BLOCK_SIZE", params.block_size));
+
+    return jit;
+}
+
+// Shared kernel-data builder for all depth_to_space variants: validates,
+// derives dispatch sizes and JIT constants, and fills a single CL kernel.
+KernelsData DepthToSpaceKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const {
+    KernelData kd = KernelData::Default<depth_to_space_params>(params);
+    depth_to_space_params& newParams = *static_cast<depth_to_space_params*>(kd.params.get());
+
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    auto runInfo = SetDefault(newParams);
+    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
+    auto cldnn_jit = GetJitConstants(newParams);
+    std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    auto& kernel = kd.kernels[0];
+
+    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+
+    // Caller-provided priority (each derived kernel passes its own).
+    kd.estimatedTime = estimatedTime;
+
+    return { kd };
+}
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "common_kernel_base.h"
+#include "kernel_selector_params.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// depth_to_space_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Parameters for the DEPTH_TO_SPACE primitive.
+struct depth_to_space_params : public base_params {
+    depth_to_space_params() : base_params(KernelType::DEPTH_TO_SPACE), block_size(0) {}
+    // Spatial upscale factor (features shrink by block_size^2).
+    size_t block_size;
+
+    virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// depth_to_space_optional_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Optional parameters for the DEPTH_TO_SPACE primitive (no extra fields).
+struct depth_to_space_optional_params : optional_params {
+    depth_to_space_optional_params() : optional_params(KernelType::DEPTH_TO_SPACE) {}
+};
+
+// Fusion descriptor for DEPTH_TO_SPACE when fused into another primitive.
+struct depth_to_space_fuse_params : fuse_params {
+    depth_to_space_fuse_params() : fuse_params(KernelType::DEPTH_TO_SPACE) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// DepthToSpaceKernelBase
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common base for depth_to_space kernel implementations: shared validation,
+// dispatch-size defaults, JIT constants and kernel-data assembly.
+class DepthToSpaceKernelBase : public common_kernel_base {
+public:
+    using common_kernel_base::common_kernel_base;
+    virtual ~DepthToSpaceKernelBase() {}
+
+    struct DispatchData : public CommonDispatchData {
+    };
+
+protected:
+    virtual bool Validate(const Params&, const optional_params&) const;
+    virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
+    virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const;
+    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
+};
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "depth_to_space_kernel_block2_opt.h"
+#include "kernel_selector_utils.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+// Supported-features key: f16-only, plain bfyx in and out.
+ParamsKey DepthToSpaceKernelBlock2Opt::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    return k;
+}
+
+bool DepthToSpaceKernelBlock2Opt::Validate(const Params& p, const optional_params& o) const {
+    if (!DepthToSpaceKernelBase::Validate(p, o))
+        return false;
+
+    const auto& params = static_cast<const depth_to_space_params&>(p);
+
+    // This optimized variant only handles block_size == 2 with an even input
+    // width (SetDefault dispatches over half the input width).
+    if ((params.block_size != 2) || (params.inputs[0].X().v % 2 != 0))
+        return false;
+
+    return true;
+}
+
+CommonDispatchData DepthToSpaceKernelBlock2Opt::SetDefault(const depth_to_space_params& params) const {
+    CommonDispatchData runInfo;
+
+    // Each work-item handles a pair of input pixels along X, hence the
+    // half-width global size (aligned up to 16 lanes); Y spans input rows.
+    std::vector<size_t> global = { Align(params.inputs[0].X().v / 2, 16),
+                                   params.inputs[0].Y().v,
+                                   1};
+
+    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+
+    runInfo.gws0 = global[0];
+    runInfo.gws1 = global[1];
+    runInfo.gws2 = global[2];
+
+    runInfo.lws0 = local[0];
+    runInfo.lws1 = local[1];
+    runInfo.lws2 = local[2];
+
+    return runInfo;
+}
+
+JitConstants DepthToSpaceKernelBlock2Opt::GetJitConstants(const depth_to_space_params& params) const {
+    JitConstants jit = Parent::GetJitConstants(params);
+
+    // Half the input width — the per-work-item pair count used by SetDefault.
+    jit.AddConstant(MakeJitConstant("IN_WIDTH", params.inputs[0].X().v / 2));
+
+    return jit;
+}
+
+// Higher priority than the reference kernel (FORCE_PRIORITY_5 vs _9).
+KernelsData DepthToSpaceKernelBlock2Opt::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetCommonKernelsData(params, options, FORCE_PRIORITY_5);
+}
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "depth_to_space_kernel_base.h"
+
+namespace kernel_selector {
+// Optimized depth_to_space for block_size == 2 on even-width f16 bfyx
+// tensors; falls back to the reference kernel otherwise (see Validate).
+class DepthToSpaceKernelBlock2Opt : public DepthToSpaceKernelBase {
+public:
+    using Parent = DepthToSpaceKernelBase;
+
+    DepthToSpaceKernelBlock2Opt() : DepthToSpaceKernelBase("depth_to_space_block2_opt") {}
+    virtual ~DepthToSpaceKernelBlock2Opt() {}
+
+    bool Validate(const Params&, const optional_params&) const override;
+    JitConstants GetJitConstants(const depth_to_space_params& params) const override;
+    CommonDispatchData SetDefault(const depth_to_space_params& params) const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+};
+} // namespace kernel_selector
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return k;
}
-CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params,
- const optional_params&) const {
- CommonDispatchData runInfo;
-
- std::vector<size_t> global = {params.output.Batch().v,
- params.output.Feature().v,
- params.output.Y().v * params.output.X().v};
-
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
-}
-
-JitConstants DepthToSpaceKernelRef::GetJitConstants(const depth_to_space_params& params) const {
- JitConstants jit = MakeBaseParamsJitConstants(params);
-
- jit.AddConstant(MakeJitConstant("BLOCK_SIZE", params.block_size));
-
- return jit;
-}
-
KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
- KernelData kd = KernelData::Default<depth_to_space_params>(params);
- depth_to_space_params& newParams = *static_cast<depth_to_space_params*>(kd.params.get());
-
- assert(params.GetType() == KernelType::DEPTH_TO_SPACE);
-
- auto runInfo = SetDefault(newParams, options);
- auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
- auto cldnn_jit = GetJitConstants(newParams);
- std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
- auto& kernel = kd.kernels[0];
-
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
-
- kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
- return {kd};
+ return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
}
} // namespace kernel_selector
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#pragma once
-#include "common_kernel_base.h"
+#include "depth_to_space_kernel_base.h"
namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// depth_to_space_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct depth_to_space_params : public base_params {
- depth_to_space_params() : base_params(KernelType::DEPTH_TO_SPACE), block_size(0) {}
-
- size_t block_size;
-
- virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// depth_to_space_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct depth_to_space_optional_params : optional_params {
- depth_to_space_optional_params() : optional_params(KernelType::DEPTH_TO_SPACE) {}
-};
-
-class DepthToSpaceKernelRef : public common_kernel_base {
+class DepthToSpaceKernelRef : public DepthToSpaceKernelBase {
public:
- DepthToSpaceKernelRef() : common_kernel_base("depth_to_space_ref") {}
+ DepthToSpaceKernelRef() : DepthToSpaceKernelBase("depth_to_space_ref") {}
virtual ~DepthToSpaceKernelRef() {}
- virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
- virtual CommonDispatchData SetDefault(const depth_to_space_params& params, const optional_params&) const;
+
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
};
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "depth_to_space_kernel_selector.h"
#include "depth_to_space_kernel_ref.h"
+#include "depth_to_space_kernel_block2_opt.h"
namespace kernel_selector {
-depth_to_space_kernel_selector::depth_to_space_kernel_selector() { Attach<DepthToSpaceKernelRef>(); }
+depth_to_space_kernel_selector::depth_to_space_kernel_selector() {
+ Attach<DepthToSpaceKernelRef>();
+ Attach<DepthToSpaceKernelBlock2Opt>();
+}
KernelsData depth_to_space_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::DEPTH_TO_SPACE);
for (size_t i = 0; i < ewParams.inputs.size(); i++) {
// Allow the same input sizes OR per-channel operation
if ((ewParams.inputs[i].LogicalSize() != output.LogicalSize()) &&
- (ewParams.inputs[i].LogicalSize() != output.Feature().v) &&
+ (ewParams.inputs[i].LogicalSize() != output.Feature().v || ewParams.inputs[i].Feature().v != output.Feature().v) &&
(ewParams.inputs[i].LogicalSize() != 1))
return false;
}
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
+
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
+
k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableDifferentInputWeightsTypes();
- k.EnableDifferentTypes();
+
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+
k.EnableOutputLayout(DataLayout::bf);
+
+ k.EnableDifferentInputWeightsTypes();
+ k.EnableDifferentTypes();
k.EnableBiasPerOutput();
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
}
KernelsData FullyConnectedKernelIMAD::GetKernelsData(const Params& params, const optional_params& options) const {
+ auto fc_params = static_cast<const fully_connected_params&>(params);
+ auto& input = fc_params.inputs[0];
+
KernelsData res = {};
for (size_t i = 0; i < autoTuneOptions.size(); i++) {
KernelsData kd = GetTunedKernelsDataByIndex(params,
options,
- DataLayout::b_fs_yx_fsv4,
+ input.GetLayout(),
WeightsLayout::os_is_yx_osv16_isv4,
FORCE_PRIORITY_1,
static_cast<int>(i));
k.EnableFusedConvEltwiseRWOutOpt();
}
+ if (depth_to_space_already_fused) {
+ k.EnableFusedConvEltwDepthToSpaceFusing();
+ }
+
return k;
}
}
static DataTensor GetConvolutionBFYXPaddedTensor(const fused_conv_eltwise_params& cp) {
- DataTensor t = cp.inputs[0];
+ DataTensor t;
+ if (cp.inputs.size() > 1 && (cp.inputs[0].X().v <= cp.inputs[1].X().v))
+ t = cp.inputs[1];
+ else
+ t = cp.inputs[0];
std::vector<Tensor::Pad> pad{{0, 0}, {0, 0}, {0, 0}, {0, 0}, { 0, 0 } };
auto& conv = cp.conv;
/*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
float non_conv_scale = 1.0f;
bool second_input_in_output = false;
+ bool depth_to_space_already_fused = false;
std::string to_string() const override;
std::string to_cache_string_v2() const override;
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "fused_conv_eltwise_kernel_bfyx_iyxo.h"
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace kernel_selector {
+constexpr size_t sub_group_size = 16;
+
+fused_conv_eltwise_kernel_bfyx_iyxo::fused_conv_eltwise_kernel_bfyx_iyxo()
+ : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_bfyx_iyxo") {
+}
+
+ParamsKey fused_conv_eltwise_kernel_bfyx_iyxo::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::F16);
+ k.EnableInputWeightsType(WeightsType::F16);
+ k.EnableOutputDataType(Datatype::F16);
+ k.EnableInputLayout(DataLayout::bfyx);
+ k.EnableOutputLayout(DataLayout::bfyx);
+ k.EnableOutputLayout(DataLayout::image_2d_rgba);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableSubGroup();
+ k.EnableSubGroupShort();
+ k.EnableBiasPerFeature();
+ k.EnableBiasPerOutput();
+ k.EnableNonBiasTerm();
+ k.EnableBatching();
+ k.EnableDifferentTypes();
+ k.EnableFusedConvEltwSplitSupport();
+ k.EnableFusedConvEltwDilation();
+ k.EnableFusedConvEltwTranspose();
+ k.EnableFusedConvEltwiseRWOutOpt();
+ k.EnableFusedConvEltwDepthToSpaceFusing();
+ return k;
+}
+
+fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_iyxo::SetDefault(
+ const fused_conv_eltwise_params& cp,
+ int) const {
+ DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp);
+
+ runInfo.efficiency = FORCE_PRIORITY_9;
+
+ runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 4 / 2;
+ runInfo.gws1 = cp.output.Y().v / 2;
+ runInfo.gws2 = sub_group_size;
+
+ runInfo.lws0 = 1;
+ runInfo.lws1 = 1;
+ runInfo.lws2 = sub_group_size;
+
+ return runInfo;
+}
+
+bool fused_conv_eltwise_kernel_bfyx_iyxo::Validate(const Params& p, const optional_params& o) const {
+ if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) {
+ return false;
+ }
+
+ const auto& params = static_cast<const fused_conv_eltwise_params&>(p);
+ if (params.inputs[0].X().v % 128 || params.inputs[0].Y().v % 2)
+ return false;
+
+ return true;
+}
+
+JitConstants fused_conv_eltwise_kernel_bfyx_iyxo::GetJitConstants(const fused_conv_eltwise_params& params,
+ const DispatchData& runInfo) const {
+ auto jit = Parent::GetJitConstants(params, runInfo);
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+ return jit;
+}
+
+KernelsData fused_conv_eltwise_kernel_bfyx_iyxo::GetKernelsData(const Params& params,
+ const optional_params& options) const {
+ return GetTunedKernelsDataByIndex(params, options);
+}
+
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "fused_conv_eltwise_kernel_base.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+
+class fused_conv_eltwise_kernel_bfyx_iyxo : public fused_conv_eltwise_kernel_base {
+public:
+ using Parent = fused_conv_eltwise_kernel_base;
+ fused_conv_eltwise_kernel_bfyx_iyxo();
+ virtual ~fused_conv_eltwise_kernel_bfyx_iyxo() {}
+
+ KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+ ParamsKey GetSupportedKey() const override;
+
+protected:
+ WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params&) const override {
+ return WeightsLayout::iyxo;
+ }
+ JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override;
+ bool Validate(const Params& p, const optional_params& o) const override;
+ bool NeedPaddedInput() const override { return true; }
+ DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override;
+};
+} // namespace kernel_selector
-// Copyright (c) 2016-2018 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "fused_conv_eltwise_kernel_yxfb_yxio_b16.h"
#include "fused_conv_eltwise_kernel_imad.h"
#include "fused_conv_eltwise_kernel_af32_imad_1x1.h"
+#include "fused_conv_eltwise_kernel_bfyx_iyxo.h"
namespace kernel_selector {
fused_conv_eltwise_kernel_selector::fused_conv_eltwise_kernel_selector() {
Attach<fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8>();
Attach<fused_conv_eltwise_kernel_imad>();
Attach<fused_conv_eltwise_kernel_af32_imad_1x1>();
+ Attach<fused_conv_eltwise_kernel_bfyx_iyxo>();
}
KernelsData fused_conv_eltwise_kernel_selector::GetBestKernels(const Params& params,
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "mvn_kernel_b_fs_yx_fsv16_imad.hpp"
+#include "common/common_tools.h"
+
+#include <string>
+#include <algorithm>
+#include <iostream>
+
+namespace kernel_selector {
+
+static constexpr size_t simd = 16;
+static constexpr size_t fsv = 16;
+static constexpr size_t pref_work_groups = 16;
+
+ParamsKey MVNKernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::F16);
+ k.EnableOutputDataType(Datatype::F32);
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableDifferentTypes();
+ k.EnableBatching();
+ // TODO Add support for across channels
+ // k.EnableMVNMode(MVNMode::ACROSS_CHANNELS);
+ k.EnableMVNMode(MVNMode::WITHIN_CHANNELS);
+ k.EnableMVNNormalizeVariance();
+ return k;
+}
+
+bool MVNKernel_b_fs_yx_fsv16_imad::Validate(const Params& p, const optional_params& options) const {
+ if (!Parent::Validate(p, options))
+ return false;
+
+ auto params = static_cast<const mvn_params&>(p);
+
+ // TODO Add support for input padding via iterating over y (parallel or in kernel).
+ if (params.inputs[0].X().pad.Total() != 0 || params.inputs[0].Y().pad.Total() != 0)
+ return false;
+
+ return true;
+}
+
+MVNKernelBase::DispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefault(const mvn_params& params) const {
+ auto kd = Parent::SetDefault(params);
+
+ auto items_num = params.output.X().v * params.output.Y().v;
+ auto max_wg = params.engineInfo.maxWorkGroupSize;
+ auto slm_per_sg = fsv * 4;
+ auto max_slm = params.engineInfo.maxLocalMemSize;
+ auto max_sgs = max_slm / slm_per_sg;
+
+ auto max_lws = std::min(max_wg, max_sgs * simd);
+
+ auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd;
+
+ kd.gws0 = lws;
+ kd.gws1 = CeilDiv(params.output.Feature().v, fsv);
+ kd.gws2 = params.output.Batch().v;
+
+ kd.lws0 = lws;
+ kd.lws1 = 1;
+ kd.lws2 = 1;
+
+ kd.itemsNum = 1;
+
+ return kd;
+}
+
+JitConstants MVNKernel_b_fs_yx_fsv16_imad::GetJitConstants(const mvn_params& params, DispatchData kd) const {
+ auto jits = Parent::GetJitConstants(params, kd);
+
+ auto activation_dt = GetActivationType(params);
+ jits.Merge(MakeTypeJitConstants(activation_dt, "MEAN"));
+ jits.AddConstant(MakeJitConstant("SIMD", simd));
+ jits.AddConstant(MakeJitConstant("LWS", kd.lws0));
+ jits.AddConstant(MakeJitConstant("GWS", kd.gws0));
+ jits.AddConstant(MakeJitConstant("ITEM_GROUPS", kd.itemsNum));
+
+ if (!params.fused_ops.empty()) {
+ std::vector<std::string> idx_order;
+ idx_order = { "b", "(f + set_idx)", "(output_spatial / OUTPUT_SIZE_X)", "(output_spatial % OUTPUT_SIZE_X)" };
+ auto conf = FusedOpsConfiguration("", idx_order, "normalized", activation_dt);
+ jits.Merge(MakeFusedOpsJitConstants(params, { conf }));
+ }
+ return jits;
+}
+
+MVNKernel_b_fs_yx_fsv16_imad::MultiDispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefaultForMulti(const mvn_params& params) const {
+ MultiDispatchData md;
+
+ auto items_num = params.output.X().v * params.output.Y().v;
+ auto max_wg = params.engineInfo.maxWorkGroupSize;
+ auto slm_per_sg = fsv * 4;
+ auto max_slm = params.engineInfo.maxLocalMemSize;
+ auto max_sgs = max_slm / slm_per_sg;
+
+ auto max_lws = std::min(max_wg, max_sgs * simd);
+ auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd;
+
+    // TODO: check whether using a larger number of work-groups provides any additional benefit
+ size_t item_groups = pref_work_groups;
+ md.item_groups = item_groups;
+
+ size_t stage1_lws = lws;
+
+ md.stage_1.gws0 = stage1_lws * item_groups;
+ md.stage_1.gws1 = CeilDiv(params.output.Feature().v, fsv);
+ md.stage_1.gws2 = params.output.Batch().v;
+
+ md.stage_1.lws0 = stage1_lws;
+ md.stage_1.lws1 = 1;
+ md.stage_1.lws2 = 1;
+
+ md.stage_1.itemsNum = item_groups;
+
+ size_t stage2_lws = std::max(std::min(item_groups, max_lws) / simd, (size_t)1) * simd;
+
+ md.stage_2.gws0 = stage2_lws;
+ md.stage_2.gws1 = CeilDiv(params.output.Feature().v, fsv);
+ md.stage_2.gws2 = params.output.Batch().v;
+
+ md.stage_2.lws0 = stage2_lws;
+ md.stage_2.lws1 = 1;
+ md.stage_2.lws2 = 1;
+
+ md.stage_2.itemsNum = item_groups;
+
+ md.stage_final.gws0 = std::max(items_num / simd, (size_t)1) * simd;
+ md.stage_final.gws1 = CeilDiv(params.output.Feature().v, fsv);
+ md.stage_final.gws2 = params.output.Batch().v;
+
+ md.stage_final.lws0 = simd;
+ md.stage_final.lws1 = 1;
+ md.stage_final.lws2 = 1;
+
+ md.stage_final.itemsNum = 1;
+
+ return md;
+}
+
+KernelsData MVNKernel_b_fs_yx_fsv16_imad::GetMultiStageKernelsData(const mvn_params& params, const optional_params& options, float estimated_time) const {
+ if (!Validate(params, options))
+ return {};
+
+ constexpr size_t intermidiate_bytes = 4;
+ const mvn_params& orgParams = static_cast<const mvn_params&>(params);
+
+ auto runInfo = SetDefaultForMulti(orgParams);
+
+ size_t kernels_num = params.mvnNormalizeVariance ? 5 : 3;
+ KernelData kd = KernelData::Default<mvn_params>(params, kernels_num);
+
+ auto finalKernelName = GetKernelName(orgParams);
+ {
+ // Mean first stage
+ auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_1);
+ cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_1", 1));
+ auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+ auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+ auto& kernel = kd.kernels[0];
+ FillCLKernelData(kernel,
+ runInfo.stage_1,
+ params.engineInfo,
+ finalKernelName,
+ jit,
+ entry_point,
+ "",
+ false,
+ false,
+ 0,
+ 0);
+ kernel.arguments.clear(); // Clear original output argument
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 0 });
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+ kd.internalBufferSizes.push_back(
+ params.output.Batch().v * Align(params.output.Feature().v, fsv) * runInfo.item_groups * intermidiate_bytes);
+ }
+ {
+ // Mean second stage
+ auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_2);
+ cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_2", 1));
+ auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+ auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+ auto& kernel = kd.kernels[1];
+ FillCLKernelData(kernel,
+ runInfo.stage_2,
+ params.engineInfo,
+ finalKernelName,
+ jit,
+ entry_point,
+ "",
+ false,
+ false,
+ 0,
+ 0);
+ kernel.arguments.clear(); // Clear original output argument
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 1 });
+ kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * intermidiate_bytes);
+ }
+ if (params.mvnNormalizeVariance) {
+ // Variance first stage
+ auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_1);
+ cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_1", 1));
+ auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+ auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+ auto& kernel = kd.kernels[2];
+ FillCLKernelData(kernel,
+ runInfo.stage_1,
+ params.engineInfo,
+ finalKernelName,
+ jit,
+ entry_point,
+ "",
+ false,
+ false,
+ 0,
+ 0);
+ kernel.arguments.clear(); // Clear original output argument
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 0 });
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 1 });
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+ }
+ if (params.mvnNormalizeVariance) {
+ // Variance second stage
+ auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_2);
+ cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_2", 1));
+ auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+ auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+ auto& kernel = kd.kernels[3];
+ FillCLKernelData(kernel,
+ runInfo.stage_2,
+ params.engineInfo,
+ finalKernelName,
+ jit,
+ entry_point,
+ "",
+ false,
+ false,
+ 0,
+ 0);
+ kernel.arguments.clear(); // Clear original output argument
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 2 });
+ kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * intermidiate_bytes);
+ }
+ { // Final
+ auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_final);
+ cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN", 1));
+ cldnn_jit.AddConstant(MakeJitConstant("PRECALC_MEAN", 1));
+ cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance));
+ auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+ auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+ auto& kernel = kd.kernels[kernels_num - 1];
+ FillCLKernelData(kernel,
+ runInfo.stage_final,
+ params.engineInfo,
+ finalKernelName,
+ jit,
+ entry_point,
+ "",
+ false,
+ false,
+ 1,
+ GetFusedPrimitiveInputsCount(params));
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 1 });
+ if (params.mvnNormalizeVariance) {
+ kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 2 });
+ }
+ }
+ kd.intenralBufferDataType = Datatype::F32;
+ kd.estimatedTime = estimated_time;
+
+ return { kd };
+}
+
+
+KernelsData MVNKernel_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& optParams) const {
+ const mvn_params& orgParams = static_cast<const mvn_params&>(params);
+
+ auto max_slm = params.engineInfo.maxLocalMemSize;
+ auto slm_per_sg = fsv * 4;
+ auto max_lws = params.engineInfo.maxWorkGroupSize;
+ auto items_num = orgParams.output.X().v * orgParams.output.Y().v;
+
+ auto enough_slm = max_lws / simd * simd * slm_per_sg <= max_slm;
+ auto enough_lws = max_lws / simd >= 1;
+ auto enough_items = items_num >= max_lws / simd * simd * pref_work_groups;
+
+ if (enough_slm && enough_lws && enough_items)
+ return GetMultiStageKernelsData(orgParams, optParams, FORCE_PRIORITY_4);
+ else
+ return GetCommonKernelsData(params, optParams, FORCE_PRIORITY_4);
+}
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "mvn_kernel_base.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+class MVNKernel_b_fs_yx_fsv16_imad : public MVNKernelBase {
+public:
+ using Parent = MVNKernelBase;
+ MVNKernel_b_fs_yx_fsv16_imad() : MVNKernelBase("mvn_gpu_b_fs_yx_fsv16_imad") {}
+ virtual ~MVNKernel_b_fs_yx_fsv16_imad() {}
+
+ KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+ ParamsKey GetSupportedKey() const override;
+
+protected:
+ struct MultiDispatchData {
+ DispatchData stage_1;
+ DispatchData stage_2;
+ DispatchData stage_final;
+
+ size_t item_groups;
+ };
+
+ bool Validate(const Params&, const optional_params&) const override;
+ DispatchData SetDefault(const mvn_params& params) const override;
+ JitConstants GetJitConstants(const mvn_params& params, DispatchData kd) const override;
+ std::vector<FusedOpType> GetSupportedFusedOps() const override {
+ return {
+ FusedOpType::ACTIVATION,
+ FusedOpType::QUANTIZE,
+ FusedOpType::ELTWISE,
+ FusedOpType::SCALE
+ };
+ }
+
+ KernelsData GetMultiStageKernelsData(const mvn_params& params, const optional_params&, float estimated_time) const;
+ MultiDispatchData SetDefaultForMulti(const mvn_params& params) const;
+};
+} // namespace kernel_selector
#include "mvn_kernel_selector.h"
#include "mvn_kernel_ref.h"
#include "mvn_kernel_bfyx_opt.h"
+#include "mvn_kernel_b_fs_yx_fsv16_imad.hpp"
namespace kernel_selector {
mvn_kernel_selector::mvn_kernel_selector() {
Attach<MVNKernelRef>();
Attach<MVNKernelBfyxOpt>();
+ Attach<MVNKernel_b_fs_yx_fsv16_imad>();
}
KernelsData mvn_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::MVN);
}
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
case WeightsLayout::g_os_is_zyx_isv16_osv16:
case WeightsLayout::giy_xs_os_xsv2_osv16__ao32:
case WeightsLayout::g_os_is_yx_isv16_osv16:
+ case WeightsLayout::os_is_yx_osv16_isv16:
return 16;
case WeightsLayout::os_i_osv8__ai8:
case WeightsLayout::iy_xs_os_xsv2_osv8__ao32:
ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_params& params) const {
DispatchData kd;
- auto global = GetTensorFriendlyWorkGroups(params.inputs[0]);
+ auto& input = params.inputs[0];
+ DataTensor input_tensor = input;
+    // Image-format reorders use the read_image/write_image functions, which operate on 4 channels at once and support only a single batch,
+    // so make sure the reorder work size covers the spatial dimensions only
+ if (params.inputs[0].GetLayout() == DataLayout::image_2d_rgba || params.output.GetLayout() == DataLayout::image_2d_rgba) {
+ std::vector<size_t> input_sizes(4, 1);
+ input_sizes[0] = input.X().v;
+ input_sizes[1] = input.Y().v;
+ input_tensor = DataTensor(input_sizes, input.GetDType(), DataLayout::image_2d_rgba);
+ }
+
+ auto global = GetTensorFriendlyWorkGroups(input_tensor);
auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
kd.gws0 = global[0];
#include <core/common/kernel_selector_utils.h>
#include "resample_kernel_ref.h"
+#include <algorithm>
+#include <vector>
+#include <string>
+
namespace kernel_selector {
ParamsKey ResampleKernelRef::GetSupportedKey() const {
return GetCommonKernelsData(params, options);
}
+static size_t packing_factor(const resample_params& params) {
+    // TODO: add support for input-only packing (currently both input and output must be 8-bit)
+ bool in_out_8bit = (params.inputs[0].GetDType() == Datatype::UINT8 || params.inputs[0].GetDType() == Datatype::INT8) &&
+ (params.output.GetDType() == Datatype::UINT8 || params.output.GetDType() == Datatype::INT8);
+
+ if (!in_out_8bit)
+ return 1;
+
+ auto get_layout_packing_factor = [](const DataLayout& layout) -> size_t {
+ switch (layout) {
+ case DataLayout::b_fs_yx_fsv16:
+ return 16;
+ case DataLayout::b_fs_yx_fsv4:
+ return 4;
+ case DataLayout::byxf_af32:
+ return 16;
+ default:
+ break;
+ }
+ return 1;
+ };
+
+ size_t input_factor = get_layout_packing_factor(params.inputs[0].GetLayout());
+ size_t output_factor = get_layout_packing_factor(params.output.GetLayout());
+
+ return std::min(input_factor, output_factor);
+}
+
+static bool use_packing(const resample_params& params) {
+ if (params.resampleType != ResampleType::NEAREST_NEIGHBOR)
+ return false;
+
+ auto pack = packing_factor(params);
+ if (pack == 1)
+ return false;
+
+ if (params.inputs[0].Feature().v % pack != 0 || params.output.Feature().v % pack != 0 ||
+ params.inputs[0].Feature().pad.before % pack != 0 || params.output.Feature().pad.before % pack != 0)
+ return false;
+
+ auto packed_work_items = params.output.X().v * params.output.Y().v * params.output.Z().v
+ * CeilDiv(params.output.Feature().v, pack) * params.output.Batch().v;
+    // TODO: loosen this requirement to the minimum number of EUs needed to saturate cache bandwidth
+ constexpr size_t max_work_items_per_eu = 32 * 7;
+ auto minimum_work_items = params.engineInfo.computeUnitsCount * max_work_items_per_eu;
+
+ if (packed_work_items < minimum_work_items)
+ return false;
+
+ return true;
+}
+
JitConstants ResampleKernelRef::GetJitConstants(const resample_params& params) const {
JitConstants jit = ResampleKernelBase::GetJitConstants(params);
+ if (use_packing(params)) {
+ jit.AddConstant(MakeJitConstant("PACK_SIZE", packing_factor(params)));
+ jit.AddConstant(MakeJitConstant("FEATURE_PACKED_MODE", "1"));
+ }
+
if (!params.fused_ops.empty()) {
std::vector<std::string> idx_order;
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
return jit;
}
+
+ResampleKernelBase::DispatchData ResampleKernelRef::SetDefault(const resample_params& arg) const {
+ auto dispatch = Parent::SetDefault(arg);
+
+ if (use_packing(arg)) {
+ auto pack = packing_factor(arg);
+ std::vector<size_t> global;
+ std::vector<size_t> local;
+
+ global = { arg.output.X().v, arg.output.Y().v * arg.output.Z().v, CeilDiv(arg.output.Feature().v, pack) * arg.output.Batch().v };
+ local = GetOptimalLocalWorkGroupSizes(global, arg.engineInfo);
+
+ dispatch.gws0 = global[0];
+ dispatch.gws1 = global[1];
+ dispatch.gws2 = global[2];
+
+ dispatch.lws0 = local[0];
+ dispatch.lws1 = local[1];
+ dispatch.lws2 = local[2];
+ }
+
+ return dispatch;
+}
} // namespace kernel_selector
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
+
+protected:
+ DispatchData SetDefault(const resample_params& arg) const override;
};
} // namespace kernel_selector
}
-KERNEL (concatenation_gpu_ref)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
+KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
{
const uint x = (uint)get_global_id(0) % INPUT0_SIZE_X;
const uint y = (uint)get_global_id(0) / INPUT0_SIZE_X;
uint input_offset = FUNC_CALL(get_input_index)(b, f, w, z, y, x);
uint output_offset = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
- output[output_offset] = ACTIVATION(input[input_offset], ACTIVATION_PARAMS);
+ output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(input[input_offset], ACTIVATION_PARAMS));
}
--- /dev/null
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+
+#if QUANTIZATION_TERM
+ #define ACCUMULATOR_TYPE int
+ #define TO_ACCUMULATOR_TYPE(x) convert_int(x)
+ #define ACTIVATION_TYPE float
+ #define TO_ACTIVATION_TYPE(x) convert_float(x)
+#else
+ #define ACCUMULATOR_TYPE INPUT0_TYPE
+ #define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
+ #define ACTIVATION_TYPE INPUT0_TYPE
+ #define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
+#endif
+
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
+
+// 1x1 int8 IMAD convolution over b_fs_yx_fsv16 input/output layout.
+// Each SIMD16 sub-group computes OUT_BLOCK_WIDTH output x-positions for a
+// 32-wide output-feature group (two 16-feature blocks, accumulators
+// dotProd[0..W-1] and dotProd[W..2W-1]). When FEATURE_LWS_SPLIT != 1 the
+// input-feature range is additionally split across sub-groups and partial
+// sums are reduced through local memory below (the branch structure assumes
+// FEATURE_LWS_SPLIT == 4 — sub-group ids 0..3; TODO confirm host side).
+__attribute__((intel_reqd_sub_group_size(16)))
+KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
+    const __global INPUT0_TYPE *conv_input,
+    __global OUTPUT_TYPE *output,
+    const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx)
+{
+    // Per-lane input x-offset LUT. Offsets beyond the window actually read
+    // by the shuffles ((OUT_BLOCK_WIDTH-1)*STRIDE_SIZE_X at most) are clamped
+    // to 0 so those lanes still load a valid in-bounds address; their values
+    // are never consumed by intel_sub_group_shuffle below.
+    #define LUT_VALUE_CLAMP(x) ((x) < (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + 1 ? (x) : 0)
+    const int tmp[16] = {
+        LUT_VALUE_CLAMP(0),
+        LUT_VALUE_CLAMP(1),
+        LUT_VALUE_CLAMP(2),
+        LUT_VALUE_CLAMP(3),
+        LUT_VALUE_CLAMP(4),
+        LUT_VALUE_CLAMP(5),
+        LUT_VALUE_CLAMP(6),
+        LUT_VALUE_CLAMP(7),
+        LUT_VALUE_CLAMP(8),
+        LUT_VALUE_CLAMP(9),
+        LUT_VALUE_CLAMP(10),
+        LUT_VALUE_CLAMP(11),
+        LUT_VALUE_CLAMP(12),
+        LUT_VALUE_CLAMP(13),
+        LUT_VALUE_CLAMP(14),
+        LUT_VALUE_CLAMP(15)
+    };
+    #undef LUT_VALUE_CLAMP
+
+#if FEATURE_LWS_SPLIT != 1
+    const uint subgroup_id = get_sub_group_id();
+#else
+    const uint subgroup_id = 0;
+#endif
+    const uint subgroup_local_id = get_sub_group_local_id();
+
+    // Output coordinates: gws2 enumerates (batch, 32-wide feature group).
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = get_global_id(1);
+    const uint out_b = (uint)(get_group_id(2) * 32) / ALIGN(OUTPUT_FEATURE_NUM, 32);
+    const uint out_fg = (uint)(get_group_id(2) * 32) % ALIGN(OUTPUT_FEATURE_NUM, 32);
+    const uint out_f = out_fg + subgroup_local_id;
+
+    // Slice of input features handled by this sub-group.
+    const uint feature_offset = subgroup_id * INPUT0_FEATURE_NUM / FEATURE_LWS_SPLIT;
+
+    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * 2] = { 0 };
+
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, feature_offset, 0, 0);
+    uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16, feature_offset, 0, 0);
+
+    // Main accumulation over input features, 16 at a time (fsv16 slices).
+    __attribute__((opencl_unroll_hint(1)))
+    for(uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16)/FEATURE_LWS_SPLIT; k++ ) {
+        // 16 int8 weights (4 per uint) for each of the two 16-feature blocks.
+        uint4 weights_val = vload4(0, (__global uint*)(weights + filter_idx));
+        uint4 weights_val2 = vload4(0, (__global uint *)(weights + filter_idx2));
+
+        // Each lane loads the input pixel at its LUT x-offset; lanes then
+        // exchange pixels via sub-group shuffles per output position.
+        uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, feature_offset + k * 16, input_y, input_x + tmp[get_sub_group_local_id()]);
+        uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
+
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+            const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+            // 4 IMADs x 4 int8 lanes = 16 input features per output block.
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X)), as_char4(weights_val.s0)));
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X)), as_char4(weights_val.s1)));
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X)), as_char4(weights_val.s2)));
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X)), as_char4(weights_val.s3)));
+
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s0)));
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s1)));
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s2)));
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s3)));
+        }
+
+        // Advance to the next 16-input-feature slice (16 osv x 16 isv weights).
+        filter_idx += 16 * 16;
+        filter_idx2 += 16 * 16;
+    }
+
+#if FEATURE_LWS_SPLIT != 1
+    // Feature-split reduction through SLM: sub-groups 1..3 publish their
+    // partials; sub-groups 0 and 1 sum three partials each and write outputs.
+    __local ACCUMULATOR_TYPE partial_acc[16 * OUT_BLOCK_WIDTH * (FEATURE_LWS_SPLIT - 1) * 2];
+    if (subgroup_id == 0) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    } else if (subgroup_id == 1) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[i * 16 + subgroup_local_id] = dotProd[i];
+            // Sub-group 1 takes over the second 16-feature block.
+            dotProd[i] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    } else if (subgroup_id == 2) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[2 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[3 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    } else if (subgroup_id == 3) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[4 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[5 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Only sub-groups 0 and 1 produce output after the reduction.
+    if (subgroup_id >= 2)
+        return;
+    __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+    for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+        dotProd[i] += partial_acc[(i + subgroup_id * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+        dotProd[i] += partial_acc[(i + (subgroup_id + 2) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+        dotProd[i] += partial_acc[(i + (subgroup_id + 4) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+    }
+#endif
+
+#if FEATURE_LWS_SPLIT == 1
+# define OUTPUT_FEATURES_PER_WI 2
+# if BIAS_TERM
+    BIAS_TYPE bias[OUTPUT_FEATURES_PER_WI] = { biases[out_f], biases[out_f + 16] };
+# endif
+#else
+# define OUTPUT_FEATURES_PER_WI 1
+# if BIAS_TERM
+    BIAS_TYPE bias[OUTPUT_FEATURES_PER_WI] = { biases[out_f + subgroup_id * 16] };
+# endif
+#endif
+
+    // Bias add, fused-ops post-processing and store.
+    for (uint j = 0; j < OUTPUT_FEATURES_PER_WI; j++) {
+        uint out_f_offset = subgroup_id * 16 + j * 16;
+
+#if OUTPUT_FEATURE_NUM % 32 != 0 && OUTPUT_FEATURE_NUM % 32 <= 16
+        // Second 16-feature block entirely past the real feature count.
+        if (out_fg + 32 > OUTPUT_FEATURE_NUM && out_f_offset >= OUTPUT_FEATURE_NUM % 32)
+            break;
+#endif
+
+        const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + out_f_offset, out_y, out_x);
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_PRELOAD
+#endif
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+
+#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
+            // Right-edge tile: skip x positions past the output width.
+            if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && i >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
+                break;
+#endif
+            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
+#if BIAS_TERM
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i] + bias[j];
+#else
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i];
+#endif
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+    #if FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_CALC
+    #else
+            FUSED_OPS
+    #endif
+            result = FUSED_OPS_RESULT;
+#else
+            result = TO_OUTPUT_TYPE(dequantized);
+#endif
+
+#if OUTPUT_FEATURE_NUM % 16 != 0
+            // Zero-fill padding lanes of a partially filled fsv16 slice.
+            if (out_fg + out_f_offset + 16 > OUTPUT_FEATURE_NUM && subgroup_local_id >= OUTPUT_FEATURE_NUM % 16)
+                result = (OUTPUT_TYPE)0;
+#endif
+            output[dst_index + i * 16] = result;
+        }
+    }
+
+#undef OUTPUT_FEATURES_PER_WI
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef MAKE_VECTOR_TYPE
+#undef TO_ACTIVATION_TYPE
+#undef ACTIVATION_TYPE
+#undef TO_ACCUMULATOR_TYPE
+#undef ACCUMULATOR_TYPE
+
+#undef CEIL_DIV
+#undef ALIGN
--- /dev/null
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+
+#if QUANTIZATION_TERM
+#define ACCUMULATOR_TYPE int
+#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
+#define ACTIVATION_TYPE float
+#define TO_ACTIVATION_TYPE(x) convert_float(x)
+#else
+#define ACCUMULATOR_TYPE INPUT0_TYPE
+#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
+#define ACTIVATION_TYPE INPUT0_TYPE
+#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
+#endif
+
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
+
+// int8 conv_input and weights data is packed to int32 "batches",
+// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
+// int8 IMAD convolution (general FILTER_SIZE_Y x FILTER_SIZE_X) over
+// b_fs_yx_fsv16 layout. Each SIMD16 sub-group computes OUT_BLOCK_WIDTH
+// output x-positions for OFM_BLOCKS_PER_SIMD (1 or 2) blocks of 16 output
+// features; gws2 enumerates (batch, OFM_SIZE_PER_SIMD-wide feature group).
+// int8 conv_input and weights data is packed to int32 "batches",
+// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
+__attribute__((intel_reqd_sub_group_size(16)))
+__attribute__((reqd_work_group_size(1, 1, 16)))
+KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_3x3)(
+    const __global INPUT0_TYPE *conv_input,
+    __global OUTPUT_TYPE *output,
+    const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx) {
+
+    // Per-lane input x-offset LUT; offsets past the row window actually
+    // consumed by the shuffles are clamped to 0 to keep loads in-bounds.
+    #define LUT_VALUE_CLAMP(x) ((x) < (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + FILTER_SIZE_X ? (x) : 0)
+    const int tmp[16] = {
+        LUT_VALUE_CLAMP(0),
+        LUT_VALUE_CLAMP(1),
+        LUT_VALUE_CLAMP(2),
+        LUT_VALUE_CLAMP(3),
+        LUT_VALUE_CLAMP(4),
+        LUT_VALUE_CLAMP(5),
+        LUT_VALUE_CLAMP(6),
+        LUT_VALUE_CLAMP(7),
+        LUT_VALUE_CLAMP(8),
+        LUT_VALUE_CLAMP(9),
+        LUT_VALUE_CLAMP(10),
+        LUT_VALUE_CLAMP(11),
+        LUT_VALUE_CLAMP(12),
+        LUT_VALUE_CLAMP(13),
+        LUT_VALUE_CLAMP(14),
+        LUT_VALUE_CLAMP(15)
+    };
+    #undef LUT_VALUE_CLAMP
+
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = get_global_id(1);
+    const uint out_b = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) / ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
+    const uint out_fg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
+    const uint out_f = out_fg + get_sub_group_local_id();
+    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * OFM_BLOCKS_PER_SIMD] = {0};
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, 0, 0, 0);
+#if OFM_BLOCKS_PER_SIMD == 2
+    uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16, 0, 0, 0);
+#endif
+
+    // Accumulate over input-feature slices (16 at a time) and filter spatial.
+    __attribute__((opencl_unroll_hint(1)))
+    for (uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16); k++) {
+        __attribute__((opencl_unroll_hint(1)))
+        for (uint j = 0; j < FILTER_SIZE_Y; j++) {
+            // One input row per (k, j): each lane holds a different x offset.
+            uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, k * 16, input_y + j, input_x + tmp[get_sub_group_local_id()]);
+            uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
+
+            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+            for (uint i = 0; i < FILTER_SIZE_X; i++) {
+
+                uint4 weights_val = vload4(0, (__global uint *)(weights + filter_idx));
+#if OFM_BLOCKS_PER_SIMD == 2
+                uint4 weights_val3 = vload4(0, (__global uint *)(weights + filter_idx2));
+#endif
+
+                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+                    const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+                    // 4 IMADs x 4 int8 lanes = 16 input features per tap.
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s0)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s1)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s2)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s3)));
+
+#if OFM_BLOCKS_PER_SIMD == 2
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s0)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s1)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s2)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s3)));
+#endif
+                }
+                // Next filter tap: 16 osv x 16 isv int8 weights consumed.
+                filter_idx += 16 * 16;
+#if OFM_BLOCKS_PER_SIMD == 2
+                filter_idx2 += 16 * 16;
+#endif
+            }
+        }
+    }
+
+#if BIAS_TERM
+    BIAS_TYPE bias[OFM_BLOCKS_PER_SIMD] = { biases[out_f]
+#if OFM_BLOCKS_PER_SIMD == 2
+        , biases[out_f + 16]
+#endif
+    };
+#endif
+    // Bias add, fused-ops post-processing and store per 16-feature block.
+    __attribute__((opencl_unroll_hint(OFM_BLOCKS_PER_SIMD)))
+    for (uint j = 0; j < OFM_BLOCKS_PER_SIMD; j++) {
+        const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + j * 16, out_y, out_x);
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_PRELOAD;
+#endif
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+
+#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
+            // Right-edge tile: skip x positions past the output width.
+            if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && i >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
+                break;
+#endif
+            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
+#if BIAS_TERM
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i] + bias[j];
+#else
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i];
+#endif
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+    #if FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_CALC;
+    #else
+            FUSED_OPS;
+    #endif
+            result = FUSED_OPS_RESULT;
+#else
+            result = TO_OUTPUT_TYPE(dequantized);
+#endif
+
+#if OUTPUT_FEATURE_NUM % 16 != 0
+            // Zero-fill padding lanes of a partially filled fsv16 slice.
+            if (out_fg + j * 16 + 16 > OUTPUT_FEATURE_NUM && get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % 16)
+                result = (OUTPUT_TYPE)0;
+#endif
+            output[dst_index + i * 16] = result;
+        }
+    }
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef MAKE_VECTOR_TYPE
+#undef TO_ACTIVATION_TYPE
+#undef ACTIVATION_TYPE
+#undef TO_ACCUMULATOR_TYPE
+#undef ACCUMULATOR_TYPE
+
+#undef CEIL_DIV
+#undef ALIGN
--- /dev/null
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+
+#if QUANTIZATION_TERM
+ #define ACCUMULATOR_TYPE int
+ #define TO_ACCUMULATOR_TYPE(x) convert_int(x)
+ #define ACTIVATION_TYPE float
+ #define TO_ACTIVATION_TYPE(x) convert_float(x)
+#else
+ #define ACCUMULATOR_TYPE INPUT0_TYPE
+ #define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
+ #define ACTIVATION_TYPE INPUT0_TYPE
+ #define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
+#endif
+
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+
+// int8 IMAD 3x3 convolution over b_fs_yx_fsv16 with fixed 4-way k-slicing:
+// four SIMD16 sub-groups each accumulate one quarter of the input features
+// for a 32-wide output-feature group (2 blocks of 16), then the partial sums
+// are reduced through SLM and sub-groups 0/1 write the final outputs.
+// NOTE(review): out_b divides by OUTPUT_FEATURE_NUM without the ALIGN(..., 32)
+// used by the imad_1x1 variant — appears to assume OUTPUT_FEATURE_NUM is a
+// multiple of 32; confirm against the host-side applicability checks.
+__attribute__((intel_reqd_sub_group_size(16)))
+KERNEL(convolution_gpu_b_fs_yx_fsv16_3x3_ks)(
+    const __global INPUT0_TYPE *conv_input,
+    __global OUTPUT_TYPE *output,
+    const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx)
+{
+    // Precomputed per-lane input x-offset LUTs for the supported
+    // (OUT_BLOCK_WIDTH, STRIDE_SIZE_X) combinations; unused lanes are 0.
+#if OUT_BLOCK_WIDTH == 7 && STRIDE_SIZE_X == 1
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0};
+#elif OUT_BLOCK_WIDTH == 7 && STRIDE_SIZE_X == 2
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0};
+#elif OUT_BLOCK_WIDTH == 8 && STRIDE_SIZE_X == 1
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0};
+#else // OUT_BLOCK_WIDTH == 8 && STRIDE_SIZE_X == 2
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+#endif
+
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = get_global_id(1);
+    const uint out_f = (uint)(get_group_id(2) * 32 + get_sub_group_local_id());
+    const uint subgroup_id = get_sub_group_id();
+    const uint subgroup_local_id = get_sub_group_local_id();
+    // Quarter of the input features handled by this sub-group (4-way split).
+    const uint feature_offset = subgroup_id * INPUT0_FEATURE_NUM / 4;
+    const uint out_b = (uint)(get_group_id(2) * 32) / OUTPUT_FEATURE_NUM;
+
+    // dotProd[0..W-1]: first 16 output features; [W..2W-1]: second 16.
+    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * 2] = { 0 };
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, feature_offset, 0, 0);
+    // Distance between the two 16-feature weight blocks (16 ofm x 3x3 x ifm).
+    uint diff_filter_idx = 16*3*3*FILTER_IFM_NUM;
+
+    // Accumulate this sub-group's quarter of the input features.
+    __attribute__((opencl_unroll_hint(1)))
+    for(uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16)/4; k++ ) {
+        __attribute__((opencl_unroll_hint(1)))
+        for(uint j = 0; j < FILTER_SIZE_Y; j++) {
+            // One input row; each lane holds a different x offset from the LUT.
+            uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, feature_offset + k * 16, input_y + j, input_x + tmp[subgroup_local_id]);
+            uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
+
+            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+            for(uint i = 0; i < FILTER_SIZE_X; i++) {
+
+                uint4 weights_val = vload4(0, (__global uint*)(weights + filter_idx));
+                uint4 weights_val3 = vload4(0, (__global uint *)(weights + filter_idx + diff_filter_idx));
+
+                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+                for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+                    const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+                    // 4 IMADs x 4 int8 lanes = 16 input features per tap.
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s0)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s1)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s2)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s3)));
+
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s0)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s1)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s2)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s3)));
+                }
+                // Next filter tap: 16 osv x 16 isv int8 weights consumed.
+                filter_idx += 16 * 16;
+            }
+        }
+    }
+
+    //k slicing summing up with SLM
+    __local ACCUMULATOR_TYPE partial_acc[16 * OUT_BLOCK_WIDTH * 6];
+    if(subgroup_id == 0)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for(uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+    else if(subgroup_id == 1)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for(uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[i * 16 + subgroup_local_id] = dotProd[i];
+            // Sub-group 1 takes over the second 16-feature block.
+            dotProd[i] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+    else if (subgroup_id == 2)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[2 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[3 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+
+        }
+    }
+    else if (subgroup_id == 3)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[4 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[5 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Sub-groups 0 and 1 each sum three stored partials and emit one
+    // 16-feature output block; sub-groups 2 and 3 are done.
+    if (subgroup_id < 2) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            dotProd[i] += partial_acc[(i + subgroup_id * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+            dotProd[i] += partial_acc[(i + (subgroup_id + 2) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+            dotProd[i] += partial_acc[(i + (subgroup_id + 4) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+        }
+#if BIAS_TERM
+        BIAS_TYPE bias = biases[out_f + get_sub_group_id() * 16];
+#endif
+
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_PRELOAD;
+#endif
+        const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + subgroup_id * 16, out_y, out_x);
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
+#if BIAS_TERM
+            dequantized = (ACTIVATION_TYPE)dotProd[i] + bias;
+#else
+            dequantized = (ACTIVATION_TYPE)dotProd[i];
+#endif
+#if HAS_FUSED_OPS
+    #if FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_CALC;
+    #else
+            FUSED_OPS;
+    #endif
+            output[dst_index + i * 16] = FUSED_OPS_RESULT;
+#else
+            output[dst_index + i * 16] = TO_OUTPUT_TYPE(dequantized);
+#endif
+        }
+    }
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef MAKE_VECTOR_TYPE
+#undef TO_ACTIVATION_TYPE
+#undef ACTIVATION_TYPE
+#undef TO_ACCUMULATOR_TYPE
+#undef ACCUMULATOR_TYPE
+
+#undef CEIL_DIV
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "include/imad.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+#include "include/mmad.cl"
+
+// ======================================================================================
+// Host side jit-constants:
+// ======================================================================================
+// SIMD [{8, 16}] - Sub-group/simd size for the kernel. Used as third dimension of
+// local work size.
+// TILE_X [uint] - Number of output values along x dimension calculated by single
+// work-item/sub-group.
+// LWS0 [uint] - Local work size 0th dimension.
+// LWS1 [uint] - Local work size 1st dimension.
+// FILTER_BLOCKED - Number of filter spatial elements to process using IMAD. Must be less
+// or equal to total filter spatial size.
+// Currently only supported to be multiple of 4.
+// ======================================================================================
+// Supported operations:
+// input/output format: any b_fs_yx_fsv<k> - where <k> >= SIMD,
+// input and output formats must be the same
+// weights format: os_i_yxs_osv<k>_yxsv4 - where <k> same as in input format
+// input data types: uchar8, char8
+// weights data types: uchar8, char8
+// output data types: uchar8, char8, half, float
+// asymmetric quantization: weights zero points, compensation term
+// ======================================================================================
+
+#if OUTPUT_LAYOUT_B_FS_YX_FSV16
+# define FSV 16
+#elif OUTPUT_LAYOUT_B_FS_YX_FSV32
+# define FSV 32
+#else
+# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - unsupported output layout.
+#endif
+
+#define F_PER_WI ((FSV) / (SIMD))
+
+#define DEQUANTIZED_TYPE float
+#define DEQUANTIZED_TYPE2 MAKE_VECTOR_TYPE(DEQUANTIZED_TYPE, 2)
+#define DEQUANTIZED_TYPE4 MAKE_VECTOR_TYPE(DEQUANTIZED_TYPE, 4)
+
+#define INPUT_TYPE INPUT0_TYPE
+#define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
+#define INPUT_TYPE4 MAKE_VECTOR_TYPE(INPUT0_TYPE, 4)
+#define INPUT_TYPE8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+#define INPUT_TYPE16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
+
+#define FILTER_TYPE4 MAKE_VECTOR_TYPE(FILTER_TYPE, 4)
+
+#define OUTPUT_TYPE2 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2)
+#define OUTPUT_TYPE4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
+#define OUTPUT_TYPE8 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)
+#define OUTPUT_TYPE16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
+
+#define AS_INPUT_TYPE(val) CAT(as_, INPUT_TYPE)(val)
+#define AS_INPUT_TYPE2(val) CAT(as_, INPUT_TYPE2)(val)
+#define AS_INPUT_TYPE4(val) CAT(as_, INPUT_TYPE4)(val)
+#define AS_INPUT_TYPE8(val) CAT(as_, INPUT_TYPE8)(val)
+#define AS_INPUT_TYPE16(val) CAT(as_, INPUT_TYPE16)(val)
+
+#define AS_FILTER_TYPE4(val) CAT(as_, FILTER_TYPE4)(val)
+
+#define TO_DEQUANTIZED_TYPE(val) CAT(convert_, DEQUANTIZED_TYPE)(val)
+
+#define GET_INPUT_INDEX(b, f, y, x) INPUT0_GET_INDEX(b, f, y, x)
+#if FSV == 16
+# define GET_WEIGHTS_INDEX(g, o, i, y, x) GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(FILTER, g, 0, 0, y, x)
+#else
+# define GET_WEIGHTS_INDEX(g, o, i, y, x) GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(FILTER, g, 0, 0, y, x)
+#endif
+#define GET_OUTPUT_INDEX(b, f, y, x) OUTPUT_GET_INDEX(b, f, y, x)
+#define GET_BIAS_INDEX(b, f, y, x) BIAS_GET_INDEX(b, f, y, x)
+
+#define INPUT_X_PITCH FSV
+#define INPUT_Y_PITCH (FSV * (INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X))
+
+#define WEIGHTS_YXS_PITCH (4 * FSV)
+
+#define FILTER_SPATIAL_SIZE (FILTER_SIZE_X * FILTER_SIZE_Y)
+
+#if OUTPUT_TYPE_SIZE == 1
+# define OUTPUT_BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr), as_uchar(val));
+# define OUTPUT_BLOCK_WRITE2(ptr, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr), as_uchar2(val));
+# define OUTPUT_BLOCK_WRITE4(ptr, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr), as_uchar4(val));
+# define OUTPUT_BLOCK_WRITE8(ptr, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr), as_uchar8(val));
+# define OUTPUT_BLOCK_WRITE16(ptr, val) BLOCK_WRITE_UC_16((__global uchar*)(ptr), as_uchar16(val));
+#elif OUTPUT_TYPE_SIZE == 2
+# define OUTPUT_BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us((__global ushort*)(ptr), as_ushort(val));
+# define OUTPUT_BLOCK_WRITE2(ptr, val) intel_sub_group_block_write_us2((__global ushort*)(ptr), as_ushort2(val));
+# define OUTPUT_BLOCK_WRITE4(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
+# define OUTPUT_BLOCK_WRITE8(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
+# define OUTPUT_BLOCK_WRITE16(ptr, val) \
+ OUTPUT_BLOCK_WRITE8(ptr, (val).lo) \
+ OUTPUT_BLOCK_WRITE8((__global ushort*)(ptr) + 8 * get_max_sub_group_size(), (val).hi)
+#elif OUTPUT_TYPE_SIZE == 4
+# define OUTPUT_BLOCK_WRITE(ptr, val) intel_sub_group_block_write((__global uint*)(ptr), as_uint(val));
+# define OUTPUT_BLOCK_WRITE2(ptr, val) intel_sub_group_block_write2((__global uint*)(ptr), as_uint2(val));
+# define OUTPUT_BLOCK_WRITE4(ptr, val) intel_sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
+# define OUTPUT_BLOCK_WRITE8(ptr, val) intel_sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
+# define OUTPUT_BLOCK_WRITE16(ptr, val) \
+ OUTPUT_BLOCK_WRITE8(ptr, (val).lo) \
+ OUTPUT_BLOCK_WRITE8((__global uint*)(ptr) + 8 * get_max_sub_group_size(), (val).hi)
+#else
+# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - unsupported output type.
+#endif
+
+#define VEC_TO_ARRAY_2(arr, vec, offset) \
+ (arr)[(offset) + 0] = (vec).s0; \
+ (arr)[(offset) + 1] = (vec).s1
+#define VEC_TO_ARRAY_4(arr, vec, offset) \
+ VEC_TO_ARRAY_2(arr, (vec).s01, offset); \
+ VEC_TO_ARRAY_2(arr, (vec).s23, (offset) + 2)
+#define VEC_TO_ARRAY_8(arr, vec, offset) \
+ VEC_TO_ARRAY_4(arr, (vec).s0123, offset); \
+ VEC_TO_ARRAY_4(arr, (vec).s4567, (offset) + 4)
+#define VEC_TO_ARRAY_16(arr, vec, offset) \
+ VEC_TO_ARRAY_8(arr, (vec).s01234567, offset); \
+ VEC_TO_ARRAY_8(arr, (vec).s89abcdef, (offset) + 8)
+
+#define ARRAY_TO_VEC_2(vec, arr, offset) \
+ (vec).s0 = (arr)[(offset)]; \
+ (vec).s1 = (arr)[(offset) + 1]
+
+#define ARRAY_TO_VEC_4(vec, arr, offset) \
+ ARRAY_TO_VEC_2((vec).s01, arr, offset); \
+ ARRAY_TO_VEC_2((vec).s23, arr, (offset) + 2)
+
+#define ARRAY_TO_VEC_8(vec, arr, offset) \
+ ARRAY_TO_VEC_4((vec).s0123, arr, offset); \
+ ARRAY_TO_VEC_4((vec).s4567, arr, (offset) + 4)
+
+#define ARRAY_TO_VEC_16(vec, arr, offset) \
+ ARRAY_TO_VEC_8((vec).s01234567, arr, offset); \
+ ARRAY_TO_VEC_8((vec).s89abcdef, arr, (offset) + 8)
+
+#if FILTER_BLOCKED % 4 != 0
+# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - FILTER_BLOCKED must be multiple of 4.
+#endif
+
+#ifndef OUTPUT_PAD_VALUE
+# define OUTPUT_PAD_VALUE (OUTPUT_TYPE)(0)
+# define OUTPUT_PAD_VALUE_undef
+#endif
+
+// Depthwise convolution kernel using IMAD (4-way int8 dot product)
+// accumulation for b_fs_yx_fsv16/fsv32 layouts. Each work-item computes
+// TILE_X consecutive output columns for F_PER_WI feature chunks of one
+// FSV-wide feature slice; the sub-group (SIMD lanes) spans the feature
+// slice dimension. NOTE(review): helper macros (F_PER_WI, GET_INPUT_INDEX,
+// BLOCK_READ_UC_*, FUSED_OPS_*) are defined earlier in this file or in
+// generated headers — comments below assume their usual definitions.
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS0, LWS1, SIMD)))
+KERNEL(convolution)(
+ const __global INPUT0_TYPE *input,
+ __global OUTPUT_TYPE *output,
+ const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+ const __global BIAS_TYPE *biases,
+#endif
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+ const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
+#endif
+#if ASYMMETRIC_DATA_QUANTIZATION
+ const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
+#endif
+#if COMPENSATION_TERM
+ const __global COMPENSATION_TYPE *compensation,
+#endif
+#if HAS_FUSED_OPS_DECLS
+ FUSED_OPS_DECLS,
+#endif
+ uint split_idx
+) {
+ // Work-item mapping: output x tile along gws0, y along gws1;
+ // gws2 groups pack (batch, feature-slice) pairs.
+ uint x = get_global_id(0) * TILE_X;
+ uint y = get_global_id(1);
+ uint bf = get_group_id(2);
+ uint b = bf % OUTPUT_BATCH_NUM;
+ uint f = bf / OUTPUT_BATCH_NUM * FSV;
+
+ uint input_offset = GET_INPUT_INDEX(b, f, (int)y * STRIDE_SIZE_Y - PADDING_SIZE_Y, (int)x * STRIDE_SIZE_X - PADDING_SIZE_X);
+ uint weights_offset = GET_WEIGHTS_INDEX(f, 0, 0, 0, 0);
+
+ // Per-output int32 accumulators, indexed [tile_x * F_PER_WI + feature_chunk].
+ int acc[TILE_X * F_PER_WI] = { };
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+ // Running sum of input activations, needed to subtract weight zero-points.
+ int src_sum[TILE_X * F_PER_WI] = { };
+#endif
+
+ // Main loop: consume 4 filter spatial positions per iteration so each
+ // accumulation is a full 4-element imad dot product.
+ __attribute__((opencl_unroll_hint))
+ for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
+ // Loop over 4 filter spatials that match imad case
+ uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
+
+ uint4 fx = fis % FILTER_SIZE_X;
+ uint4 fy = fis / FILTER_SIZE_X;
+
+ // Input loading:
+ INPUT_TYPE in_trans0[TILE_X * F_PER_WI];
+ INPUT_TYPE in_trans1[TILE_X * F_PER_WI];
+ INPUT_TYPE in_trans2[TILE_X * F_PER_WI];
+ INPUT_TYPE in_trans3[TILE_X * F_PER_WI];
+#if STRIDE_SIZE_X == 1
+ // Without strides block reads can be used to load whole TILE_X inputs
+ // Block read ladder to select optimal combination of block reads for TILE_X
+ uint4 input_x_offset = fx * (DILATION_SIZE_X * INPUT_X_PITCH);
+ uint4 input_y_offset = fy * (DILATION_SIZE_Y * INPUT_Y_PITCH);
+ uint4 input_spatial_offset = input_x_offset + input_y_offset;
+ uint4 input_idx = input_spatial_offset + input_offset;
+
+ uint tx = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
+ INPUT_TYPE16 tmp_in0 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s0)));
+ INPUT_TYPE16 tmp_in1 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s1)));
+ INPUT_TYPE16 tmp_in2 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s2)));
+ INPUT_TYPE16 tmp_in3 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s3)));
+
+ VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
+ VEC_TO_ARRAY_16(in_trans1, tmp_in1, tx);
+ VEC_TO_ARRAY_16(in_trans2, tmp_in2, tx);
+ VEC_TO_ARRAY_16(in_trans3, tmp_in3, tx);
+
+ input_idx += 16 * SIMD;
+ }
+ if (TILE_X * F_PER_WI % 16 >= 8) {
+ INPUT_TYPE8 tmp_in0 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s0)));
+ INPUT_TYPE8 tmp_in1 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s1)));
+ INPUT_TYPE8 tmp_in2 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s2)));
+ INPUT_TYPE8 tmp_in3 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s3)));
+
+ VEC_TO_ARRAY_8(in_trans0, tmp_in0, tx);
+ VEC_TO_ARRAY_8(in_trans1, tmp_in1, tx);
+ VEC_TO_ARRAY_8(in_trans2, tmp_in2, tx);
+ VEC_TO_ARRAY_8(in_trans3, tmp_in3, tx);
+
+ input_idx += 8 * SIMD;
+ tx += 8;
+ }
+ if (TILE_X * F_PER_WI % 8 >= 4) {
+ INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s0)));
+ INPUT_TYPE4 tmp_in1 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s1)));
+ INPUT_TYPE4 tmp_in2 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s2)));
+ INPUT_TYPE4 tmp_in3 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s3)));
+
+ VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx);
+ VEC_TO_ARRAY_4(in_trans1, tmp_in1, tx);
+ VEC_TO_ARRAY_4(in_trans2, tmp_in2, tx);
+ VEC_TO_ARRAY_4(in_trans3, tmp_in3, tx);
+
+ input_idx += 4 * SIMD;
+ tx += 4;
+ }
+ if (TILE_X * F_PER_WI % 4 >= 2) {
+ INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s0)));
+ INPUT_TYPE2 tmp_in1 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s1)));
+ INPUT_TYPE2 tmp_in2 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s2)));
+ INPUT_TYPE2 tmp_in3 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s3)));
+
+ VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx);
+ VEC_TO_ARRAY_2(in_trans1, tmp_in1, tx);
+ VEC_TO_ARRAY_2(in_trans2, tmp_in2, tx);
+ VEC_TO_ARRAY_2(in_trans3, tmp_in3, tx);
+
+ input_idx += 2 * SIMD;
+ tx += 2;
+ }
+ if (TILE_X * F_PER_WI % 2 == 1) {
+ in_trans0[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s0)));
+ in_trans1[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s1)));
+ in_trans2[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s2)));
+ in_trans3[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s3)));
+ }
+#else
+ // Strided case: inputs for consecutive output x are not contiguous,
+ // so block-read only along the feature dimension, per output column.
+ uint4 input_x_offset = fx * DILATION_SIZE_X * INPUT_X_PITCH;
+ uint4 input_y_offset = fy * DILATION_SIZE_Y * INPUT_Y_PITCH;
+ uint4 input_spatial_offset = input_x_offset + input_y_offset;
+ uint4 input_start_offset = input_spatial_offset + input_offset;
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ uint4 input_idx = input_start_offset + tx * STRIDE_SIZE_X * INPUT_X_PITCH;
+ // Block reads along feature slice
+ uint fw = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; fw + 4 <= F_PER_WI; fw += 4) {
+ INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s0)));
+ INPUT_TYPE4 tmp_in1 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s1)));
+ INPUT_TYPE4 tmp_in2 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s2)));
+ INPUT_TYPE4 tmp_in3 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s3)));
+
+ VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+ VEC_TO_ARRAY_4(in_trans1, tmp_in1, tx * F_PER_WI + fw);
+ VEC_TO_ARRAY_4(in_trans2, tmp_in2, tx * F_PER_WI + fw);
+ VEC_TO_ARRAY_4(in_trans3, tmp_in3, tx * F_PER_WI + fw);
+
+ input_idx += 4 * SIMD;
+ }
+ if (F_PER_WI % 4 >= 2) {
+ INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s0)));
+ INPUT_TYPE2 tmp_in1 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s1)));
+ INPUT_TYPE2 tmp_in2 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s2)));
+ INPUT_TYPE2 tmp_in3 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s3)));
+
+ VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+ VEC_TO_ARRAY_2(in_trans1, tmp_in1, tx * F_PER_WI + fw);
+ VEC_TO_ARRAY_2(in_trans2, tmp_in2, tx * F_PER_WI + fw);
+ VEC_TO_ARRAY_2(in_trans3, tmp_in3, tx * F_PER_WI + fw);
+
+ input_idx += 2 * SIMD;
+ fw += 2;
+ }
+ if (F_PER_WI % 2 == 1) {
+ in_trans0[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s0)));
+ in_trans1[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s1)));
+ in_trans2[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s2)));
+ in_trans3[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s3)));
+ }
+ }
+#endif
+ // Weights loading:
+ FILTER_TYPE4 wei[F_PER_WI];
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
+ }
+
+ // Transpose input:
+ // Regroup the 4 per-spatial arrays into 4-element vectors so one imad
+ // consumes all 4 filter positions for a given (tx, fw).
+ INPUT_TYPE4 in[TILE_X * F_PER_WI];
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ uint in_offset = tx * F_PER_WI + fw;
+ in[in_offset] = (INPUT_TYPE4)(in_trans0[in_offset], in_trans1[in_offset], in_trans2[in_offset], in_trans3[in_offset]);
+ }
+ }
+
+ // IMAD:
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ acc[tx * F_PER_WI + fw] = IMAD(acc[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], wei[fw]);
+ }
+ }
+
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+ // Accumulate for input values for asymmetric weights:
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ src_sum[tx * F_PER_WI + fw] = IMAD(src_sum[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], (char4)(1, 1, 1, 1));
+ }
+ }
+#endif
+
+ weights_offset += WEIGHTS_YXS_PITCH;
+ }
+
+
+#if FILTER_BLOCKED < FILTER_SPATIAL_SIZE
+ // Leftovers in filters spatial - use raw multiplication instead of imad
+ // Load inputs before loop to avoid byte scattered reads + there are at most 3 leftovers
+ FILTER_TYPE4 wei[F_PER_WI];
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
+ }
+
+ __attribute__((opencl_unroll_hint))
+ for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
+ // Input loading:
+ uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
+ uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
+
+ INPUT_TYPE in_trans0[TILE_X * F_PER_WI];
+# if STRIDE_SIZE_X == 1
+ uint input_x_offset = fx * (DILATION_SIZE_X * INPUT_X_PITCH);
+ uint input_y_offset = fy * (DILATION_SIZE_Y * INPUT_Y_PITCH);
+ uint input_spatial_offset = input_x_offset + input_y_offset;
+ uint input_idx = input_spatial_offset + input_offset;
+
+ uint tx = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
+ INPUT_TYPE16 tmp_in0 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx)));
+ VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
+ input_idx += 16 * SIMD;
+ }
+ if (TILE_X * F_PER_WI % 16 >= 8) {
+ INPUT_TYPE8 tmp_in0 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx)));
+ VEC_TO_ARRAY_8(in_trans0, tmp_in0, tx);
+ input_idx += 8 * SIMD;
+ tx += 8;
+ }
+ if (TILE_X * F_PER_WI % 8 >= 4) {
+ INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx)));
+ VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx);
+ input_idx += 4 * SIMD;
+ tx += 4;
+ }
+ if (TILE_X * F_PER_WI % 4 >= 2) {
+ INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx)));
+ VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx);
+ input_idx += 2 * SIMD;
+ tx += 2;
+ }
+ if (TILE_X * F_PER_WI % 2 == 1) {
+ in_trans0[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx)));
+ }
+# else
+ uint input_x_offset = fx * DILATION_SIZE_X * INPUT_X_PITCH;
+ uint input_y_offset = fy * DILATION_SIZE_Y * INPUT_Y_PITCH;
+ uint input_spatial_offset = input_x_offset + input_y_offset;
+ uint input_start_offset = input_spatial_offset + input_offset;
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ uint input_idx = input_start_offset + tx * STRIDE_SIZE_X * INPUT_X_PITCH;
+ uint fw = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; fw + 4 <= F_PER_WI; fw += 4) {
+ INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx)));
+ VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+ input_idx += 4 * SIMD;
+ }
+ if (F_PER_WI % 4 >= 2) {
+ INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx)));
+ VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+ input_idx += 2 * SIMD;
+ fw += 2;
+ }
+ if (F_PER_WI % 2 == 1) {
+ in_trans0[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx)));
+ }
+ }
+# endif
+ // Raw multiply accumulate:
+ // wei[fw][fi] selects the fi-th leftover weight from the vector loaded above.
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ acc[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw] * (int)wei[fw][fi];
+ }
+ }
+
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+ // Accumulate input values for asymmetric weights:
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ src_sum[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw];
+ }
+ }
+#endif
+ }
+#endif
+
+ // Convert int accumulators to the dequantization type for bias / zero-point /
+ // compensation / fused-ops post-processing.
+ DEQUANTIZED_TYPE dequantized[TILE_X * F_PER_WI];
+ for (uint tx = 0; tx < TILE_X * F_PER_WI; ++tx) {
+ dequantized[tx] = TO_DEQUANTIZED_TYPE(acc[tx]);
+ }
+
+#if BIAS_TERM
+# if BIAS_PER_OFM
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ uint bias_offset = f + fw * SIMD + get_sub_group_local_id();
+ BIAS_TYPE bias = biases[bias_offset];
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
+ }
+ }
+# elif BIAS_PER_OUTPUT
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ uint bias_offset = GET_BIAS_INDEX(b, f + fw * SIMD + get_sub_group_local_id(), y, x + tx);
+ BIAS_TYPE bias = biases[bias_offset];
+ dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
+ }
+ }
+# else
+# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - unsupported bias mode.
+# endif
+#endif
+
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+ // Subtract (sum of activations) * weight_zero_point per feature.
+ {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ WEIGHTS_ZERO_POINTS_TYPE wzp = weights_zp[f + fw * SIMD + get_sub_group_local_id()];
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ dequantized[tx * F_PER_WI + fw] -= TO_DEQUANTIZED_TYPE(src_sum[tx * F_PER_WI + fw]) * TO_DEQUANTIZED_TYPE(wzp);
+ }
+ }
+ }
+#endif
+
+#if COMPENSATION_TERM
+ {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ COMPENSATION_TYPE comp = compensation[f + fw * SIMD + get_sub_group_local_id()];
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(comp);
+ }
+ }
+ }
+#endif
+
+ OUTPUT_TYPE out[TILE_X * F_PER_WI];
+ // Fused ops and conversion to output type
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+#if HAS_FUSED_OPS
+ uint fused_ops_x = x + tx;
+ uint fused_ops_f = f;
+ uint fw = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; fw + 4 <= F_PER_WI; fw += 4) {
+ DEQUANTIZED_TYPE4 fused_ops_in;
+ ARRAY_TO_VEC_4(fused_ops_in, dequantized, tx * F_PER_WI + fw);
+ FUSED_OPS_4;
+ VEC_TO_ARRAY_4(out, FUSED_OPS_RESULT_4, tx * F_PER_WI + fw);
+ fused_ops_f += 4 * SIMD;
+ }
+ if (F_PER_WI % 4 >= 2) {
+ DEQUANTIZED_TYPE2 fused_ops_in;
+ ARRAY_TO_VEC_2(fused_ops_in, dequantized, tx * F_PER_WI + fw);
+ FUSED_OPS_2;
+ VEC_TO_ARRAY_2(out, FUSED_OPS_RESULT_2, tx * F_PER_WI + fw);
+ fw += 2;
+ fused_ops_f += 2 * SIMD;
+ }
+ if (F_PER_WI % 2 == 1) {
+ DEQUANTIZED_TYPE fused_ops_in;
+ fused_ops_in = dequantized[tx * F_PER_WI + fw];
+ FUSED_OPS_1;
+ out[tx * F_PER_WI + fw] = FUSED_OPS_RESULT_1;
+ }
+#else
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ out[tx * F_PER_WI + fw] = TO_OUTPUT_TYPE(dequantized[tx * F_PER_WI + fw]);
+ }
+#endif
+ }
+
+ // Fill results outside output in features with OUTPUT_PAD_VALUE.
+ if (OUTPUT_FEATURE_NUM % FSV != 0 && f + FSV > OUTPUT_FEATURE_NUM) {
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fw = 0; fw < F_PER_WI; ++fw) {
+ bool outside = fw * SIMD + get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % FSV;
+ out[tx * F_PER_WI + fw] = outside ? OUTPUT_PAD_VALUE : out[tx * F_PER_WI + fw];
+ }
+ }
+ }
+
+ uint output_offset = GET_OUTPUT_INDEX(b, f, y, x);
+
+ if (OUTPUT_SIZE_X % TILE_X == 0 || x + TILE_X <= OUTPUT_SIZE_X) {
+ // Full output tile x write using block write ladder
+ uint tx = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
+ OUTPUT_TYPE16 tmp_write;
+ ARRAY_TO_VEC_16(tmp_write, out, tx);
+ OUTPUT_BLOCK_WRITE16(output + output_offset, tmp_write);
+ output_offset += 16 * SIMD;
+ }
+ if (TILE_X * F_PER_WI % 16 >= 8) {
+ OUTPUT_TYPE8 tmp_write;
+ ARRAY_TO_VEC_8(tmp_write, out, tx);
+ OUTPUT_BLOCK_WRITE8(output + output_offset, tmp_write);
+ tx += 8;
+ output_offset += 8 * SIMD;
+ }
+ if (TILE_X * F_PER_WI % 8 >= 4) {
+ OUTPUT_TYPE4 tmp_write;
+ ARRAY_TO_VEC_4(tmp_write, out, tx);
+ OUTPUT_BLOCK_WRITE4(output + output_offset, tmp_write);
+ tx += 4;
+ output_offset += 4 * SIMD;
+ }
+ if (TILE_X * F_PER_WI % 4 >= 2) {
+ OUTPUT_TYPE2 tmp_write;
+ ARRAY_TO_VEC_2(tmp_write, out, tx);
+ OUTPUT_BLOCK_WRITE2(output + output_offset, tmp_write);
+ tx += 2;
+ output_offset += 2 * SIMD;
+ }
+ if (TILE_X * F_PER_WI % 2 == 1) {
+ OUTPUT_BLOCK_WRITE(output + output_offset, out[tx]);
+ }
+ } else {
+ // Leftovers write, block writes in f dimension only
+ __attribute__((opencl_unroll_hint))
+ for (uint tx = 0; tx < TILE_X; ++tx) {
+ if (tx < OUTPUT_SIZE_X % TILE_X) {
+ uint fw = 0;
+ __attribute__((opencl_unroll_hint))
+ for (; fw + 4 <= F_PER_WI; fw += 4) {
+ OUTPUT_TYPE4 tmp_write;
+ ARRAY_TO_VEC_4(tmp_write, out, tx * F_PER_WI + fw);
+ OUTPUT_BLOCK_WRITE4(output + output_offset + fw * SIMD, tmp_write);
+ }
+ if (F_PER_WI % 4 >= 2) {
+ OUTPUT_TYPE2 tmp_write;
+ ARRAY_TO_VEC_2(tmp_write, out, tx * F_PER_WI + fw);
+ OUTPUT_BLOCK_WRITE2(output + output_offset + fw * SIMD, tmp_write);
+ fw += 2;
+ }
+ if (F_PER_WI % 2 == 1) {
+ OUTPUT_BLOCK_WRITE(output + output_offset + fw * SIMD, out[tx * F_PER_WI + fw]);
+ }
+ }
+ output_offset += FSV;
+ }
+ }
+}
+
+#undef FSV
+
+#undef F_PER_WI
+
+#undef DEQUANTIZED_TYPE
+#undef DEQUANTIZED_TYPE2
+#undef DEQUANTIZED_TYPE4
+
+#undef INPUT_TYPE
+#undef INPUT_TYPE2
+#undef INPUT_TYPE4
+#undef INPUT_TYPE8
+#undef INPUT_TYPE16
+
+#undef FILTER_TYPE4
+
+#undef OUTPUT_TYPE2
+#undef OUTPUT_TYPE4
+#undef OUTPUT_TYPE8
+#undef OUTPUT_TYPE16
+
+#undef AS_INPUT_TYPE
+#undef AS_INPUT_TYPE2
+#undef AS_INPUT_TYPE4
+#undef AS_INPUT_TYPE8
+#undef AS_INPUT_TYPE16
+
+#undef AS_FILTER_TYPE
+
+#undef TO_DEQUANTIZED_TYPE
+
+#undef GET_INPUT_INDEX
+#undef GET_WEIGHTS_INDEX
+#undef GET_OUTPUT_INDEX
+
+#undef INPUT_X_PITCH
+#undef INPUT_Y_PITCH
+
+#undef WEIGHTS_YXS_PITCH
+
+#undef FILTER_SPATIAL_SIZE
+
+#undef OUTPUT_BLOCK_WRITE
+#undef OUTPUT_BLOCK_WRITE2
+#undef OUTPUT_BLOCK_WRITE4
+#undef OUTPUT_BLOCK_WRITE8
+#undef OUTPUT_BLOCK_WRITE16
+
+#undef VEC_TO_ARRAY_2
+#undef VEC_TO_ARRAY_4
+#undef VEC_TO_ARRAY_8
+#undef VEC_TO_ARRAY_16
+
+#undef ARRAY_TO_VEC_2
+#undef ARRAY_TO_VEC_4
+#undef ARRAY_TO_VEC_8
+#undef ARRAY_TO_VEC_16
+
+#ifdef OUTPUT_PAD_VALUE_undef
+# undef OUTPUT_PAD_VALUE
+# undef OUTPUT_PAD_VALUE_undef
+#endif
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+
+// Direct 5x5 fp16 convolution, iyxo weights layout. Each work-item
+// computes 4 consecutive output columns (idx..idx+3) for up to
+// FILTER_OFM_MAX output features at once, reading 8 halfs per input row
+// with a single vload4-of-uint bit-cast to half8.
+// NOTE(review): the half8 load and the inp[0..3+xx] window imply
+// FILTER_SIZE_X <= 5 — confirm the kernel selector enforces this.
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
+KERNEL(convolution_gpu_bfyx_iyxo_5x5)(
+ const __global UNIT_TYPE* input,
+ __global UNIT_TYPE* output,
+ const __global UNIT_TYPE* weights,
+#if BIAS_TERM
+ const __global UNIT_TYPE* bias,
+#endif
+ uint split_idx)
+{
+ const uint idx = 4 * ((uint)get_global_id(0) * 16 + (uint)get_global_id(2));
+ const uint idy = (uint)get_global_id(1);
+ uint filter_idx = 0;
+ uint output_idx = 0;
+ uint input_idx = 0;
+ UNIT_TYPE inp[8] = { 0 };
+
+// Process output features in batches of at most 16 so the per-iteration
+// out1..out4 register arrays stay bounded.
+#if FILTER_OFM_NUM > 16
+#define FILTER_OFM_MAX 16
+#else
+#define FILTER_OFM_MAX FILTER_OFM_NUM
+#endif
+ __attribute__((opencl_unroll_hint(1)))
+ for (int iter = 0; iter < FILTER_OFM_NUM / FILTER_OFM_MAX + (FILTER_OFM_NUM % FILTER_OFM_MAX != 0); iter++) {
+ // out1..out4 hold the 4 adjacent output columns for each ofm.
+ UNIT_TYPE out1[FILTER_OFM_MAX] = { 0 };
+ UNIT_TYPE out2[FILTER_OFM_MAX] = { 0 };
+ UNIT_TYPE out3[FILTER_OFM_MAX] = { 0 };
+ UNIT_TYPE out4[FILTER_OFM_MAX] = { 0 };
+
+ filter_idx = FILTER_OFM_MAX * iter;
+
+ __attribute__((opencl_unroll_hint(FILTER_IFM_NUM)))
+ for (int ifm = 0; ifm < FILTER_IFM_NUM; ifm++) {
+ __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+ for (int yy = 0; yy < FILTER_SIZE_Y; yy++) {
+ uint inp_idx = ifm * (INPUT0_FEATURE_PITCH)+(idy + yy) * (INPUT0_Y_PITCH)+idx;
+ half8 tmp = as_half8(vload4(0, (__global uint*)(input + inp_idx)));
+
+ inp[0] = tmp.s0;
+ inp[1] = tmp.s1;
+ inp[2] = tmp.s2;
+ inp[3] = tmp.s3;
+ inp[4] = tmp.s4;
+ inp[5] = tmp.s5;
+ inp[6] = tmp.s6;
+ inp[7] = tmp.s7;
+
+ __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+ for (int xx = 0; xx < FILTER_SIZE_X; xx++) {
+// iyxo layout: all ofm weights for one (ifm, y, x) are contiguous,
+// loaded as a single bit-cast vector.
+#if FILTER_OFM_NUM == 4
+ half4 w = as_half4(vload2(0, (__global uint*)(weights + filter_idx)));
+#elif FILTER_OFM_NUM == 8
+ half8 w = as_half8(vload4(0, (__global uint*)(weights + filter_idx)));
+#else
+ half16 w = as_half16(vload8(0, (__global uint*)(weights + filter_idx)));
+#endif
+ __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+ for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+ // 4 adjacent output columns share the same weight, shifted input window.
+ out1[ofm] = mad(inp[0 + xx], w[ofm], out1[ofm]);
+ out2[ofm] = mad(inp[1 + xx], w[ofm], out2[ofm]);
+ out3[ofm] = mad(inp[2 + xx], w[ofm], out3[ofm]);
+ out4[ofm] = mad(inp[3 + xx], w[ofm], out4[ofm]);
+ }
+ filter_idx += FILTER_OFM_NUM;
+ }
+ }
+ }
+
+ __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+ for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+#if BIAS_TERM
+ out1[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+ out2[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+ out3[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+ out4[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+#endif
+ out1[ofm] = ACTIVATION(out1[ofm], ACTIVATION_PARAMS);
+ out2[ofm] = ACTIVATION(out2[ofm], ACTIVATION_PARAMS);
+ out3[ofm] = ACTIVATION(out3[ofm], ACTIVATION_PARAMS);
+ out4[ofm] = ACTIVATION(out4[ofm], ACTIVATION_PARAMS);
+ output_idx = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + ofm * OUTPUT_FEATURE_PITCH +
+ idy * OUTPUT_Y_PITCH + idx;
+// Results are stored pairwise as bit-cast float writes when the offset is
+// even (float-aligned); an odd OUTPUT_OFFSET forces scalar half stores.
+#if OUTPUT_OFFSET > 0
+#if (OUTPUT_OFFSET % 2) > 0
+ output[output_idx + OUTPUT_OFFSET + 0] = out1[ofm];
+ output[output_idx + OUTPUT_OFFSET + 1] = out2[ofm];
+ output[output_idx + OUTPUT_OFFSET + 2] = out3[ofm];
+ output[output_idx + OUTPUT_OFFSET + 3] = out4[ofm];
+#else
+ __global float* out_fl = output + output_idx + OUTPUT_OFFSET;
+ out_fl[0] = as_float((half2)(out1[ofm], out2[ofm]));
+ out_fl[1] = as_float((half2)(out3[ofm], out4[ofm]));
+#endif
+#else
+ vstore2((float2)(as_float((half2)(out1[ofm], out2[ofm])), as_float((half2)(out3[ofm], out4[ofm]))),
+ 0, (__global float*)(output + output_idx));
+#endif
+ }
+ }
+}
out[out_i] = UNIT_VAL_ZERO;
}
- uint input_offset = INPUT0_OFFSET_WITH_PADDING;
- input_offset += oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X;
+ uint input_offset = oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X;
input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING;
input_offset += b * INPUT0_BATCH_PITCH;
in = input[input_idx];
#if ASYMMETRIC_DATA_QUANTIZATION
else
- in = activations_zp[k];
+ in = activations_zp[g*FILTER_IFM_NUM + k];
#endif
uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
--- /dev/null
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/include_all.cl"
+
+// Depth-to-space with block size 2 and fused activation, fp16 data:
+// 4 input feature planes per output feature are rearranged into a 2x2
+// spatial block, with results stored as bit-cast float2 pairs.
+// NOTE(review): vloadn/vstoren offsets are in units of the vector width,
+// so `inIdx` is used as a half2-element offset and `outIdx1/2` as
+// float2-element offsets — verify this matches the host-side indexing.
+KERNEL(depth_to_space_block2_opt)(const __global half* input, __global half* output)
+{
+ const int in_height = get_global_size(1);
+ const int2 pos = { get_global_id(0), get_global_id(1) };
+
+ if (pos.x >= (IN_WIDTH) || pos.y >= in_height) return;
+
+ // Size of one input feature plane (x * y elements).
+ const int offset = IN_WIDTH * in_height;
+
+ __attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM)))
+ for (uint ofm_id=0; ofm_id < OUTPUT_FEATURE_NUM; ofm_id++){
+ int add_off = offset * 2 * ofm_id * BLOCK_SIZE * BLOCK_SIZE;
+ int ofm_x_offset = offset * ofm_id;
+ const int inIdx = IN_WIDTH * pos.y + pos.x + ofm_x_offset;
+
+ // Four source planes (stride OUTPUT_FEATURE_NUM apart) feed one 2x2 block.
+ half2 conv_out_0 = ACTIVATION(vload2(inIdx+(offset * 0 * OUTPUT_FEATURE_NUM), input ), ACTIVATION_PARAMS);
+ half2 conv_out_1 = ACTIVATION(vload2(inIdx+(offset * 1 * OUTPUT_FEATURE_NUM), input ), ACTIVATION_PARAMS);
+ half2 conv_out_2 = ACTIVATION(vload2(inIdx+(offset * 2 * OUTPUT_FEATURE_NUM), input ), ACTIVATION_PARAMS);
+ half2 conv_out_3 = ACTIVATION(vload2(inIdx+(offset * 3 * OUTPUT_FEATURE_NUM), input ), ACTIVATION_PARAMS);
+
+ int outIdx1 = IN_WIDTH * BLOCK_SIZE * pos.y + pos.x;
+ int outIdx2 = outIdx1 + IN_WIDTH;
+
+ // Interleave plane pairs into the top (outIdx1) and bottom (outIdx2)
+ // rows of the upscaled block, packed two halfs per float store.
+ vstore2((float2)(as_float((half2)(conv_out_0.s0, conv_out_1.s0)), as_float((half2)(conv_out_0.s1, conv_out_1.s1))), outIdx1, (__global float*) (output + add_off));
+ vstore2((float2)(as_float((half2)(conv_out_2.s0, conv_out_3.s0)), as_float((half2)(conv_out_2.s1, conv_out_3.s1))), outIdx2, (__global float*) (output + add_off));
+ }
+}
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+
+// Fused convolution + eltwise-add (residual from eltw_input), iyxo weights
+// layout, fp16. Same MAD core as convolution_gpu_bfyx_iyxo_5x5, but the
+// results are combined with an eltwise input and can be emitted either to
+// a linear half buffer or to an RGBA image (3 feature channels per texel).
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
+KERNEL(fused_conv_eltwise_gpu_bfyx_iyxo)(
+ const __global UNIT_TYPE* input,
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ write_only image2d_t output,
+#else
+ __global UNIT_TYPE* output,
+#endif
+ const __global UNIT_TYPE* weights,
+#if BIAS_TERM
+ const __global UNIT_TYPE* bias,
+#endif
+ uint split_idx,
+ const __global UNIT_TYPE* eltw_input)
+{
+ const uint idx = 4 * ((uint)get_global_id(0) * 16 + (uint)get_global_id(2));
+ const uint idy = (uint)get_global_id(1);
+ uint filter_idx = 0;
+ uint output_idx = 0;
+ uint output_idx_eltwise = 0;
+ uint input_idx = 0;
+ UNIT_TYPE inp[8] = { 0 };
+ // Input pitches recomputed from explicit pad-before sizes (padded layout).
+ const uint input0_pitch_Y = INPUT0_SIZE_X + 2 * (INPUT0_PAD_BEFORE_SIZE_X);
+ const uint input0_pitch_feature = input0_pitch_Y * (INPUT0_SIZE_Y + 2 * (INPUT0_PAD_BEFORE_SIZE_Y));
+
+#if FILTER_OFM_NUM > 16
+#define FILTER_OFM_MAX 16
+#else
+#define FILTER_OFM_MAX FILTER_OFM_NUM
+#endif
+ __attribute__((opencl_unroll_hint(1)))
+ for (int iter = 0; iter < FILTER_OFM_NUM / FILTER_OFM_MAX + (FILTER_OFM_NUM % FILTER_OFM_MAX != 0); iter++) {
+ // out1..out4 hold 4 adjacent output columns per ofm.
+ UNIT_TYPE out1[FILTER_OFM_MAX] = { 0 };
+ UNIT_TYPE out2[FILTER_OFM_MAX] = { 0 };
+ UNIT_TYPE out3[FILTER_OFM_MAX] = { 0 };
+ UNIT_TYPE out4[FILTER_OFM_MAX] = { 0 };
+
+ filter_idx = FILTER_OFM_MAX * iter;
+
+ __attribute__((opencl_unroll_hint(FILTER_IFM_NUM)))
+ for (int ifm = 0; ifm < FILTER_IFM_NUM; ifm++) {
+ __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+ for (int yy = 0; yy < FILTER_SIZE_Y; yy++) {
+ uint inp_idx = ifm * input0_pitch_feature + (idy + yy) * input0_pitch_Y + idx;
+ half8 tmp = as_half8(vload4(0, (__global uint*)(input + inp_idx)));
+
+ inp[0] = tmp.s0;
+ inp[1] = tmp.s1;
+ inp[2] = tmp.s2;
+ inp[3] = tmp.s3;
+ inp[4] = tmp.s4;
+ inp[5] = tmp.s5;
+ inp[6] = tmp.s6;
+ inp[7] = tmp.s7;
+
+ __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+ for (int xx = 0; xx < FILTER_SIZE_X; xx++) {
+// iyxo layout: all ofm weights for one (ifm, y, x) are contiguous.
+#if FILTER_OFM_NUM == 4
+ half4 w = as_half4(vload2(0, (__global uint*)(weights + filter_idx)));
+#elif FILTER_OFM_NUM == 8
+ half8 w = as_half8(vload4(0, (__global uint*)(weights + filter_idx)));
+#else
+ half16 w = as_half16(vload8(0, (__global uint*)(weights + filter_idx)));
+#endif
+ __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+ for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+ out1[ofm] = mad(inp[0 + xx], w[ofm], out1[ofm]);
+ out2[ofm] = mad(inp[1 + xx], w[ofm], out2[ofm]);
+ out3[ofm] = mad(inp[2 + xx], w[ofm], out3[ofm]);
+ out4[ofm] = mad(inp[3 + xx], w[ofm], out4[ofm]);
+ }
+ filter_idx += FILTER_OFM_NUM;
+ }
+ }
+ }
+
+ // Post-process: RGBA image path steps ofm by 3 (one texel = 3 channels),
+ // buffer path handles one ofm per iteration.
+ __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm+=3) {
+#else
+ for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+#endif
+#if BIAS_TERM
+ out1[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+ out2[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+ out3[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+ out4[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ out1[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+ out2[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+ out3[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+ out4[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+
+ out1[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+ out2[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+ out3[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+ out4[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+#endif
+#endif
+ out1[ofm] = ACTIVATION(out1[ofm], ACTIVATION_PARAMS);
+ out2[ofm] = ACTIVATION(out2[ofm], ACTIVATION_PARAMS);
+ out3[ofm] = ACTIVATION(out3[ofm], ACTIVATION_PARAMS);
+ out4[ofm] = ACTIVATION(out4[ofm], ACTIVATION_PARAMS);
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ out1[ofm + 1] = ACTIVATION(out1[ofm + 1], ACTIVATION_PARAMS);
+ out2[ofm + 1] = ACTIVATION(out2[ofm + 1], ACTIVATION_PARAMS);
+ out3[ofm + 1] = ACTIVATION(out3[ofm + 1], ACTIVATION_PARAMS);
+ out4[ofm + 1] = ACTIVATION(out4[ofm + 1], ACTIVATION_PARAMS);
+
+ out1[ofm + 2] = ACTIVATION(out1[ofm + 2], ACTIVATION_PARAMS);
+ out2[ofm + 2] = ACTIVATION(out2[ofm + 2], ACTIVATION_PARAMS);
+ out3[ofm + 2] = ACTIVATION(out3[ofm + 2], ACTIVATION_PARAMS);
+ out4[ofm + 2] = ACTIVATION(out4[ofm + 2], ACTIVATION_PARAMS);
+#endif
+ uint ofm_alignment = 4;
+ int idx_for_image = 0;
+ int idy_for_image = 0;
+
+ // Map the ofm group onto one of 4 positions of a 2x2 upscaled block
+ // (depth-to-space style). NOTE(review): this assumes
+ // FILTER_OFM_MAX == 4 * OUTPUT_FEATURE_NUM — confirm with the selector.
+ if (ofm / OUTPUT_FEATURE_NUM == 0) {
+ output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+ 2 * idy * OUTPUT_Y_PITCH + 2 * idx;
+ output_idx = (ofm % OUTPUT_FEATURE_NUM) + 2 * idy * OUTPUT_SIZE_X * ofm_alignment + 2 * idx * ofm_alignment;
+ idx_for_image = 2 * idx;
+ idy_for_image = 2 * idy;
+ }
+ else if (ofm / OUTPUT_FEATURE_NUM == 1) {
+ output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+ 2 * idy * OUTPUT_Y_PITCH + 2 * idx + 1;
+ output_idx = (ofm % OUTPUT_FEATURE_NUM) + 2 * idy * OUTPUT_SIZE_X * ofm_alignment + (2 * idx + 1) * ofm_alignment;
+ idx_for_image = 2 * idx + 1;
+ idy_for_image = 2 * idy;
+ }
+ else if (ofm / OUTPUT_FEATURE_NUM == 2) {
+ output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+ (2 * idy + 1) * OUTPUT_Y_PITCH + 2 * idx;
+ output_idx = (ofm % OUTPUT_FEATURE_NUM) + (2 * idy + 1) * OUTPUT_SIZE_X * ofm_alignment + 2 * idx * ofm_alignment;
+ idx_for_image = 2 * idx;
+ idy_for_image = 2 * idy + 1;
+ }
+ else if (ofm / OUTPUT_FEATURE_NUM == 3) {
+ output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+ (2 * idy + 1) * OUTPUT_Y_PITCH + 2 * idx + 1;
+ output_idx = (ofm % OUTPUT_FEATURE_NUM) + (2 * idy + 1) * OUTPUT_SIZE_X * ofm_alignment + (2 * idx + 1) * ofm_alignment;
+ idx_for_image = 2 * idx + 1;
+ idy_for_image = 2 * idy + 1;
+ }
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ // Eltwise residual is added per channel; alpha is written as 0.
+ half4 output_half1 = {
+ out1[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0 + OUTPUT_FEATURE_PITCH * 0],
+ out1[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0 + OUTPUT_FEATURE_PITCH * 1],
+ out1[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0 + OUTPUT_FEATURE_PITCH * 2],
+ 0 };
+ IMAGE_WRITE(output, (int2)(idx_for_image, idy_for_image), output_half1);
+ half4 output_half2 = {
+ out2[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2 + OUTPUT_FEATURE_PITCH * 0],
+ out2[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2 + OUTPUT_FEATURE_PITCH * 1],
+ out2[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2 + OUTPUT_FEATURE_PITCH * 2],
+ 0 };
+ IMAGE_WRITE(output, (int2)(idx_for_image +2, idy_for_image), output_half2);
+ half4 output_half3 = {
+ out3[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4 + OUTPUT_FEATURE_PITCH * 0],
+ out3[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4 + OUTPUT_FEATURE_PITCH * 1],
+ out3[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4 + OUTPUT_FEATURE_PITCH * 2],
+ 0 };
+ IMAGE_WRITE(output, (int2)(idx_for_image+4, idy_for_image), output_half3);
+ half4 output_half4 = {
+ out4[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6 + OUTPUT_FEATURE_PITCH * 0],
+ out4[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6 + OUTPUT_FEATURE_PITCH * 1],
+ out4[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6 + OUTPUT_FEATURE_PITCH * 2],
+ 0 };
+ IMAGE_WRITE(output, (int2)(idx_for_image+6, idy_for_image), output_half4);
+#else
+ // Buffer path: the 4 computed columns land 2 elements apart
+ // (2x upscaled output x), each fused with the eltwise residual.
+ output[output_idx_eltwise + OUTPUT_OFFSET + 0] = out1[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0];
+ output[output_idx_eltwise + OUTPUT_OFFSET + 2] = out2[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2];
+ output[output_idx_eltwise + OUTPUT_OFFSET + 4] = out3[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4];
+ output[output_idx_eltwise + OUTPUT_OFFSET + 6] = out4[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6];
+#endif
+ }
+ }
+}
// Input reading operation is always blocked.
#define BLOCK_LOAD_INPUTS
-// for now kernel stride is square
-#define K_WSTRIDE K_STRIDE
-#define K_HSTRIDE K_STRIDE
-
// need KERNEL width for first output + STRIDE more for each additional.
-#define IN_BLOCK_WIDTH (K_WIDTH + K_WSTRIDE * (OUT_BLOCK_WIDTH - 1))
-#define IN_BLOCK_HEIGHT (K_HEIGHT + K_HSTRIDE * (OUT_BLOCK_HEIGHT - 1))
+#define IN_BLOCK_WIDTH ((FILTER_SIZE_X - 1) * DILATION_SIZE_X + STRIDE_SIZE_X * (OUT_BLOCK_WIDTH - 1) + 1)
+#define IN_BLOCK_HEIGHT ((FILTER_SIZE_Y - 1) * DILATION_SIZE_Y + STRIDE_SIZE_Y * (OUT_BLOCK_HEIGHT - 1) + 1)
// for imad we are packing 4 8bit activations per 32 bit SIMD lane
// if we later add 4bit, then PACK would be 8.
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+#define ALIGN(a, b) ((a % b == 0) ? a : a - a % b + b)
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
KERNEL (fused_convolution_eltwise_gpu_imad)(
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+ const __global INPUT0_TYPE* conv_input,
+#else
const __global PACKED_TYPE *conv_input,
- __global OUTPUT_TYPE *output,
+#endif
+ __global OUTPUT_TYPE *restrict output,
const __global int *weights,
#if BIAS_TERM
const __global BIAS_TYPE *biases,
{
const uint oc = (uint)get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
const uint or = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
- const uint fm = get_global_id(2); // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16
+ const uint fm = get_global_id(2); // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16
const uint fmg = get_group_id(2);
const uint lid = get_local_id(2);
- const uint batch = fm / _OD;
- const uint f = fm % _OD;
+ const uint batch = fm / (ALIGN(FILTER_OFM_NUM, SIMD_SIZE) * FILTER_GROUPS_NUM);
+#if GROUPED
+ const uint g = (fm / ALIGN(FILTER_OFM_NUM, SIMD_SIZE) % FILTER_GROUPS_NUM);
+ const uint ofmg = fmg % CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE);
+#else
+ const uint g = 0;
+ const uint ofmg = (fmg % (_OD / SIMD_SIZE));
+#endif
+ const uint f = fm % ALIGN(FILTER_OFM_NUM, SIMD_SIZE) + g * FILTER_OFM_NUM;
+ const uint sglid = get_sub_group_local_id();
+
+ const int input_x = oc * STRIDE_SIZE_X - PADDING_SIZE_X;
+ const int input_y = or * STRIDE_SIZE_Y - PADDING_SIZE_Y;
PACKED_TYPE in[IN_BLOCK_HEIGHT];
ACCUMULATOR_TYPE out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 }; // this is the 32 bit signed accumulator that must be converted to 8 bits before final write.
- #define NUM_FILTERS (K_HEIGHT * K_WIDTH)
+ #define NUM_FILTERS (FILTER_SIZE_Y * FILTER_SIZE_X)
int w[NUM_FILTERS];
-
int in_addr;
#ifdef BLOCK_LOAD_WEIGHTS
- int weight_addr = (fmg % ((_OD + SIMD_SIZE - 1) / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK);
+ int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4);
#else
- int weight_addr = (fmg % ((_OD + SIMD_SIZE - 1) / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK) + lid;
+ int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4) + sglid;
#endif
-
- uint input_size = (_ID * (_IH + IHPAD) * (_IW + IWPAD)) / PACK; // dividing by PACK to get right number of 32bit entities.
+ uint input_size = (_ID * (INPUT0_SIZE_Y + IHPAD) * (INPUT0_SIZE_X + IWPAD)) / PACK; // dividing by PACK to get right number of 32bit entities.
// For imad we do 4X less input feature map iterations since we are packing 4 of them in each uchar4.
- // _ID provided by host is multiple of packing factor.
__attribute__((opencl_unroll_hint(1)))
- for(int kd = 0; kd < (_ID / PACK); kd++)
+ for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++)
{
-
-#ifdef BLOCK_LOAD_INPUTS
- in_addr = INPUT0_OFFSET + kd*INPUT0_FEATURE_PITCH + (or * K_STRIDE - PADDING_SIZE_Y)*INPUT0_Y_PITCH + (oc * K_STRIDE - PADDING_SIZE_X);
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+ in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
#else
- in_addr = INPUT0_OFFSET + kd*INPUT0_FEATURE_PITCH + (or * K_STRIDE - PADDING_SIZE_Y)*INPUT0_Y_PITCH + (oc * K_STRIDE - PADDING_SIZE_X) + lid;
-#endif
+ #ifdef BLOCK_LOAD_INPUTS
+ in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
+ #else
+ in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x + sglid;
+ #endif
in_addr += batch * input_size; // adjust for batching
-
+#endif
for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
-#ifdef BLOCK_LOAD_INPUTS
- in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+ in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
+ in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
#else
+ #ifdef BLOCK_LOAD_INPUTS
+ in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
+ #else
in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide
+ #endif
+ // TODO This will cause errors for byxf_af32 format on input
+ in_addr += (INPUT0_SIZE_X + IWPAD); // move to next row down
#endif
- in_addr += (_IW + IWPAD); // move to next row down
}
#ifdef BLOCK_LOAD_WEIGHTS
int wi = 0;
// This loop is temporarily not unrolled because the unroll causes TeamCity hangs.
- //__attribute__((opencl_unroll_hint(K_HEIGHT)))
- for (int kr = 0; kr < K_HEIGHT; ++kr) // kr = Kernel Row
+ //__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+ for (int kr = 0; kr < FILTER_SIZE_Y; ++kr) // kr = Kernel Row
{
- __attribute__((opencl_unroll_hint(K_WIDTH)))
- for (int kc = 0; kc < K_WIDTH; ++kc) // kc = Kernel Column
+ __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+ for (int kc = 0; kc < FILTER_SIZE_X; ++kc) // kc = Kernel Column
{
+ __attribute__((opencl_unroll_hint))
for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
+ __attribute__((opencl_unroll_hint))
for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
- PACKED_TYPE input = sub_group_broadcast(in[br * K_HSTRIDE + kr], bc * K_WSTRIDE + kc);
+ PACKED_TYPE input = sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
- out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], AS_INPUT0_TYPE_4(input), as_char4(w[wi])));
+ out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], AS_INPUT0_TYPE_4(input), AS_FILTER_TYPE_4(w[wi])));
}
}
wi++;
// to calculate out_idx and eltw_idx. Calculate offsets with GET_DATA_B_FS_YX_FSV4_INDEX before
// entering the loop, and have a simple expressions for indexes inside the loop.
const uint output_idx_offset = GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, batch, f, or, oc);
- const uint output_row_size_bytes = (_OW + OWPAD) * PACK;
+ const uint output_row_size_bytes = (OUTPUT_SIZE_X + OWPAD) * PACK;
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD;
for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
{
- #if NEED_TO_VERIFY_OUTPUT_RANGES == 1
+ #if OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT != 0
const bool zero_r = or + r >= OUTPUT_SIZE_Y;
if(!zero_r)
#endif
{
for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
{
- #if NEED_TO_VERIFY_OUTPUT_RANGES == 1
+ #if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
const bool zero_c = oc + c >= OUTPUT_SIZE_X;
if(!zero_c)
#endif
uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
+ #elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1
+ uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#else
#error "Incorrect output layout"
#endif
ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
#endif
+ OUTPUT_TYPE final_result;
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC;
#else
FUSED_OPS;
#endif
- output[out_idx] = FUSED_OPS_RESULT;
+ final_result = FUSED_OPS_RESULT;
#else
- output[out_idx] = TO_OUTPUT_TYPE(res);
+ final_result = TO_OUTPUT_TYPE(res);
+#endif
+#if FILTER_OFM_NUM % SIMD_SIZE != 0
+ if (fmg % CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) != CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) - 1 || sglid < FILTER_OFM_NUM % SIMD_SIZE)
#endif
+ output[out_idx] = final_result;
}// if(!zero_c)
} // for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
}// if(!zero_r)
#endif
#undef BLOCK_LOAD_INPUTS
-#undef K_WSTRIDE
-#undef K_HSTRIDE
#undef IN_BLOCK_WIDTH
#undef IN_BLOCK_HEIGHT
#undef PACK
#undef AS_TYPE_N_
#undef AS_TYPE_N
#undef AS_INPUT0_TYPE_4
+#undef AS_FILTER_TYPE_4
#undef NUM_FILTERS
+#undef CEIL_DIV
+#undef ALIGN
// limitations under the License.
*/
+#include "mmad.cl"
+
// TODO: currently we calculate on float32 because it's lot of "add" operation and it stuck on the value "8192.0f"
#if !defined(ACCUMULATOR_TYPE)
#define ACCUMULATOR_TYPE float
#endif
// Creates vector type.
-#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
\ No newline at end of file
+#define MAKE_VECTOR_TYPE_IMPL_1(elem_type) elem_type
+#define MAKE_VECTOR_TYPE_IMPL_2(elem_type) CAT(elem_type, 2)
+#define MAKE_VECTOR_TYPE_IMPL_3(elem_type) CAT(elem_type, 3)
+#define MAKE_VECTOR_TYPE_IMPL_4(elem_type) CAT(elem_type, 4)
+#define MAKE_VECTOR_TYPE_IMPL_8(elem_type) CAT(elem_type, 8)
+#define MAKE_VECTOR_TYPE_IMPL_16(elem_type) CAT(elem_type, 16)
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(MAKE_VECTOR_TYPE_IMPL_, size)(elem_type)
+
+#define AS_TYPE(type, val) CAT(as_, type)(val)
+
+// ====================================================================================================================
+// TYPE_SIZE(type) - evaluates to size of "type" in bytes
+// type [PP] - Must evaluate to non-vectorized type.
+// ====================================================================================================================
+#define TYPE_SIZE_uchar 1
+#define TYPE_SIZE_char 1
+#define TYPE_SIZE_ushort 2
+#define TYPE_SIZE_short 2
+#define TYPE_SIZE_half 2
+#define TYPE_SIZE_int 4
+#define TYPE_SIZE_uint 4
+#define TYPE_SIZE_float 4
+#define TYPE_SIZE(type) CAT(TYPE_SIZE_, type)
+
+// ====================================================================================================================
+// BLOCK_READN(type, vector_size, ptr, offset)
+// - evaluates to intel_sub_group_block_read operation for specified "type" and "vector size", reading
+// "vector_size" elements from memory starting at "ptr" + "offset"
+// BLOCK_WRITEN(type, vector_size, ptr, offset, val)
+// - evaluates to intel_sub_group_block_write operation for specified "type" and "vector size", writing
+// "vector_size"-element vector "val" to memory starting at "ptr" + "offset"
+// For more details and description of intel_sub_group_block_read/write functions please,
+// refer to cl_intel_subgroups extension documentation.
+//
+// type [PP] - Must evaluate to non-vectorized type, ex. float, half, char, etc..
+// vector_size [PP] - Number of elements to read/write, ex 2 for intel_sub_group_block_read2.
+// ptr - Pointer to global memory where to read from/write to.
+// offset - Additional offset added to ptr in "type" elements, equivalent to passing ((ptr) + (offset)) as "ptr".
+// val - For write function vector of "vector_size" of "type" elements (or scalar) to write.
+//
+// ====================================================================================================================
+// Pre-defined commonly used definitions:
+// DT_<tensor>_BLOCK_READ<n>(ptr, offset)
+// DT_<tensor>_BLOCK_WRITE<n>(ptr, offset, offset)
+// Where:
+// <tensor> is one of: INPUT - referencing type jitted as INPUT0,
+// OUTPUT,
+// BIAS,
+// FILTER
+// <n> is a vector size, one of {2,4,8,16} or none, meaning the output will be a scalar
+//
+// ====================================================================================================================
+#define BLOCK_RW_TYPE_size1 uchar
+#define BLOCK_RW_TYPE_size2 ushort
+#define BLOCK_RW_TYPE_size4 uint
+#define BLOCK_RW_TYPE(type_size) CAT(BLOCK_RW_TYPE_size, type_size)
+
+#define BLOCK_READ_FUNC_size2 intel_sub_group_block_read_us
+#define BLOCK_READ_FUNC_size4 intel_sub_group_block_read
+#define BLOCK_READ_FUNC(type_size) CAT(BLOCK_READ_FUNC_size, type_size)
+
+#define BLOCK_WRITE_FUNC_size2 intel_sub_group_block_write_us
+#define BLOCK_WRITE_FUNC_size4 intel_sub_group_block_write
+#define BLOCK_WRITE_FUNC(type_size) CAT(BLOCK_WRITE_FUNC_size, type_size)
+
+#define BLOCK_READN_FUNC_size1(vector_size) CAT(BLOCK_READ_UC_, vector_size)
+#define BLOCK_READN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_READ_FUNC(type_size), vector_size)
+#define BLOCK_READN_FUNC_size2(vector_size) BLOCK_READN_FUNC_SIZE_DEF(2, vector_size)
+#define BLOCK_READN_FUNC_size4(vector_size) BLOCK_READN_FUNC_SIZE_DEF(4, vector_size)
+#define BLOCK_READN_FUNC(type_size, vector_size) CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)
+
+#define BLOCK_WRITEN_FUNC_size1(vector_size) CAT(BLOCK_WRITE_UC_, vector_size)
+#define BLOCK_WRITEN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_WRITE_FUNC(type_size), vector_size)
+#define BLOCK_WRITEN_FUNC_size2(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(2, vector_size)
+#define BLOCK_WRITEN_FUNC_size4(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size)
+#define BLOCK_WRITEN_FUNC(type_size, vector_size) CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)
+
+#define BLOCK_READN_RAW(type_size, vector_size, ptr, offset) \
+ BLOCK_READN_FUNC(type_size, vector_size)((const __global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset))
+#define BLOCK_WRITEN_RAW(type_size, vector_size, ptr, offset, val) \
+ BLOCK_WRITEN_FUNC(type_size, vector_size)( \
+ (__global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset), \
+ AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_RW_TYPE(type_size), vector_size), val))
+
+#define BLOCK_READN(type, vector_size, ptr, offset) \
+ AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, ptr, offset))
+#define BLOCK_WRITEN(type, vector_size, ptr, offset, val) \
+ BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, ptr, offset, val)
+
+#define DT_INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset)
+#define DT_INPUT_BLOCK_READ2(ptr, offset) BLOCK_READN(INPUT0_TYPE, 2, ptr, offset)
+#define DT_INPUT_BLOCK_READ4(ptr, offset) BLOCK_READN(INPUT0_TYPE, 4, ptr, offset)
+#define DT_INPUT_BLOCK_READ8(ptr, offset) BLOCK_READN(INPUT0_TYPE, 8, ptr, offset)
+#define DT_INPUT_BLOCK_READ16(ptr, offset) BLOCK_READN(INPUT0_TYPE, 16, ptr, offset)
+
+#define DT_INPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(INPUT0_TYPE, 1, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITEN(INPUT0_TYPE, 2, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITEN(INPUT0_TYPE, 4, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITEN(INPUT0_TYPE, 8, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE16(ptr, offset, val) BLOCK_WRITEN(INPUT0_TYPE, 16, ptr, offset, val)
+
+#define DT_OUTPUT_BLOCK_READ(ptr, offset) BLOCK_READN(OUTPUT_TYPE, 1, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ2(ptr, offset) BLOCK_READN(OUTPUT_TYPE, 2, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ4(ptr, offset) BLOCK_READN(OUTPUT_TYPE, 4, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ8(ptr, offset) BLOCK_READN(OUTPUT_TYPE, 8, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ16(ptr, offset) BLOCK_READN(OUTPUT_TYPE, 16, ptr, offset)
+
+#define DT_OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 2, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 4, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 8, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE16(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 16, ptr, offset, val)
+
+#define DT_BIAS_BLOCK_READ(ptr, offset) BLOCK_READN(BIAS_TYPE, 1, ptr, offset)
+#define DT_BIAS_BLOCK_READ2(ptr, offset) BLOCK_READN(BIAS_TYPE, 2, ptr, offset)
+#define DT_BIAS_BLOCK_READ4(ptr, offset) BLOCK_READN(BIAS_TYPE, 4, ptr, offset)
+#define DT_BIAS_BLOCK_READ8(ptr, offset) BLOCK_READN(BIAS_TYPE, 8, ptr, offset)
+#define DT_BIAS_BLOCK_READ16(ptr, offset) BLOCK_READN(BIAS_TYPE, 16, ptr, offset)
+
+#define DT_BIAS_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(BIAS_TYPE, 1, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITEN(BIAS_TYPE, 2, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITEN(BIAS_TYPE, 4, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITEN(BIAS_TYPE, 8, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE16(ptr, offset, val) BLOCK_WRITEN(BIAS_TYPE, 16, ptr, offset, val)
+
+#define DT_FILTER_BLOCK_READ(ptr, offset) BLOCK_READN(FILTER_TYPE, 1, ptr, offset)
+#define DT_FILTER_BLOCK_READ2(ptr, offset) BLOCK_READN(FILTER_TYPE, 2, ptr, offset)
+#define DT_FILTER_BLOCK_READ4(ptr, offset) BLOCK_READN(FILTER_TYPE, 4, ptr, offset)
+#define DT_FILTER_BLOCK_READ8(ptr, offset) BLOCK_READN(FILTER_TYPE, 8, ptr, offset)
+#define DT_FILTER_BLOCK_READ16(ptr, offset) BLOCK_READN(FILTER_TYPE, 16, ptr, offset)
+
+#define DT_FILTER_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(FILTER_TYPE, 1, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITEN(FILTER_TYPE, 2, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITEN(FILTER_TYPE, 4, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITEN(FILTER_TYPE, 8, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE16(ptr, offset, val) BLOCK_WRITEN(FILTER_TYPE, 16, ptr, offset, val)
+// ====================================================================================================================
CAT(prefix, _OFFSET) \
)
+#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
+ FUNC_CALL(get_os_is_yx_osv16_isv16_index)( \
+ o, i, y, x, \
+ CAT(prefix, _SIZE_X), \
+ CAT(prefix, _SIZE_Y), \
+ CAT(prefix, _IFM_NUM), \
+ CAT(prefix, _OFM_NUM))
+
+// Computes the flat element offset of filter element (o, i, y, x) for the
+// os_is_yx_osv16_isv16 weights layout: both the output-feature (o) and
+// input-feature (i) dimensions are tiled in slices of 16, with dimension
+// order (innermost -> outermost): isv, osv, x, y, is, os.
+// NOTE(review): o_size is accepted but unused; presumably kept so the
+// signature matches the GET_FILTER_* macro convention — confirm.
+inline uint FUNC(get_os_is_yx_osv16_isv16_index)(uint o, uint i, uint y, uint x,
+    uint x_size, uint y_size, uint i_size, uint o_size)
+{
+    // Position inside the 16-wide feature slices.
+    const uint isv = i % 16;
+    const uint osv = o % 16;
+    // Slice indices along the input/output feature dimensions.
+    const uint is = i / 16;
+    const uint os = o / 16;
+
+    // Each x step spans a full 16x16 (osv x isv) tile of elements.
+    const uint x_pitch = 16 * 16;
+    const uint y_pitch = x_pitch * x_size;
+    const uint is_pitch = y_pitch * y_size;
+    // Input-feature count is padded up to a whole number of 16-slices.
+    const uint os_pitch = is_pitch * ((i_size + 16 - 1) / 16);
+
+    const uint output_offset = isv + osv * 16 + x * x_pitch + y * y_pitch + is * is_pitch + os * os_pitch;
+
+    return output_offset;
+}
+
#define GET_FILTER_G_OS_IS_YX_ISV8_OSV16_ISV2_INDEX(prefix, g, o, i, y, x, sub_group_size) \
FUNC_CALL(get_os_is_zyx_isv8_osv16_isv2_index)( \
g, o, i, 0, y, x, \
return idx;
}
+#define GET_FILTER_G_OS_IS_YX_OSV16_ISV4_INDEX(prefix, g, o, i, y, x) \
+ FUNC_CALL(get_g_os_is_yx_osv16_isv4)( \
+ g, o, i, y, x, \
+ CAT(prefix, _IFM_PITCH), \
+ CAT(prefix, _OFM_PITCH), \
+ CAT(prefix, _SIZE_X), \
+ CAT(prefix, _SIZE_Y), \
+ CAT(prefix, _OFM_NUM), \
+ CAT(prefix, _IFM_NUM))
+
+// Computes the flat element offset of grouped filter element (g, o, i, y, x)
+// for the g_os_is_yx_osv16_isv4 weights layout: output features are tiled in
+// slices of 16 (osv), input features in slices of 4 (isv), groups outermost.
+// Per the GET_FILTER_G_OS_IS_YX_OSV16_ISV4_INDEX macro above, i_size/o_size
+// are the tensor's IFM/OFM pitches (not element counts); o_num/i_num are the
+// actual OFM/IFM counts used to size the padded slices.
+inline uint FUNC(get_g_os_is_yx_osv16_isv4)(uint g, uint o, uint i, uint y, uint x,
+    uint i_size,
+    uint o_size,
+    uint x_size,
+    uint y_size,
+    uint o_num,
+    uint i_num)
+{
+    // Output-depth tiling: 16 output features per tile.
+    const uint otd = 16;
+    uint out_depth_tile = o / otd;
+    uint od = o - out_depth_tile * otd;
+    // Number of output tiles, rounded up.
+    uint output_slice_size = (o_num + otd - 1) / otd;
+
+    // Input-depth tiling: 4 input features per tile.
+    const uint tile = 4;
+    uint id_tile = i / tile;
+    uint id = i - id_tile * tile;
+    // Number of input tiles, rounded up.
+    uint input_slice_size = (i_num + tile - 1) / tile;
+
+    // Sum of strides, outermost (group) to innermost (input position in tile).
+    uint idx = g * output_slice_size * input_slice_size * y_size * x_size * otd * tile
+               + out_depth_tile * (o_size / tile) * otd * tile
+               + id_tile * i_size * otd * tile
+               + y * x_size * otd * tile
+               + x * otd * tile
+               + od * tile
+               + id;
+
+    return idx;
+}
+
#define GET_FILTER_OS_IS_YX_OSV16_ISV4_INDEX(prefix, o, i, y, x) \
FUNC_CALL(get_os_is_yx_osv16_isv4)( \
o, i, y, x, \
CAT(prefix, _OFM_NUM),\
CAT(prefix, _OFFSET))
-inline uint FUNC(get_os_i_yxs_osv4_yxsv4_index)(uint o, uint i, uint y, uint x, uint i_size, uint size_x, uint size_y) {
+inline uint FUNC(get_os_i_yxs_osv_yxsv4_index)(uint o, uint i, uint y, uint x, uint i_size, uint size_x, uint size_y, uint osv) {
const uint yxsv = 4;
- const uint osv = 4;
uint yx = y * size_x + x;
uint yx_size_aligned = (size_x * size_y + yxsv - 1) / yxsv * yxsv;
uint os_index = o / osv;
}
#define GET_FILTER_OS_I_YXS_OSV4_YXSV4_INDEX(prefix, o, i, y, x) \
- FUNC_CALL(get_os_i_yxs_osv4_yxsv4_index)( \
+ FUNC_CALL(get_os_i_yxs_osv_yxsv4_index)( \
o, i, y, x, \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _SIZE_X), \
- CAT(prefix, _SIZE_Y))
+ CAT(prefix, _SIZE_Y), \
+ 4)
#define GET_FILTER_OS_IYX_OSV32__AI32_INDEX(prefix, o, i, y, x, sub_group_size) \
CAT(prefix, _OFFSET) + \
CAT(prefix, _OFFSET), \
sub_group_size)
-inline uint FUNC(get_gs_oi_yxs_gsv4_yxsv4_index)(uint g, uint o, uint i, uint y, uint x, uint o_size, uint i_size, uint size_x, uint size_y) {
+inline uint FUNC(get_gs_oi_yxs_gsv_yxsv4_index)(uint g, uint o, uint i, uint y, uint x, uint o_size, uint i_size, uint size_x, uint size_y, const uint gsv) {
const uint yxsv = 4;
- const uint gsv = 4;
uint yx = y * size_x + x;
uint yx_size_aligned = (size_x * size_y + yxsv - 1) / yxsv * yxsv;
uint gs_index = g / gsv;
}
#define GET_FILTER_GS_OI_YXS_GSV4_YXSV4_INDEX(prefix, g, o, i, y, x) \
- FUNC_CALL(get_gs_oi_yxs_gsv4_yxsv4_index)( \
+ FUNC_CALL(get_gs_oi_yxs_gsv_yxsv4_index)( \
g, o, i, y, x, \
CAT(prefix, _OFM_NUM), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _SIZE_X), \
- CAT(prefix, _SIZE_Y))
+ CAT(prefix, _SIZE_Y), \
+ 4)
+
+#define GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(prefix, g, o, i, y, x) \
+ FUNC_CALL(get_gs_oi_yxs_gsv_yxsv4_index)( \
+ g, o, i, y, x, \
+ CAT(prefix, _OFM_NUM), \
+ CAT(prefix, _IFM_NUM), \
+ CAT(prefix, _SIZE_X), \
+ CAT(prefix, _SIZE_Y), \
+ 16)
+
+#define GET_FILTER_GS_OI_YXS_GSV32_YXSV4_INDEX(prefix, g, o, i, y, x) \
+ FUNC_CALL(get_gs_oi_yxs_gsv_yxsv4_index)( \
+ g, o, i, y, x, \
+ CAT(prefix, _OFM_NUM), \
+ CAT(prefix, _IFM_NUM), \
+ CAT(prefix, _SIZE_X), \
+ CAT(prefix, _SIZE_Y), \
+ 32)
#define GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(prefix, g, o, i, y, x, sub_group_size) \
CAT(prefix, _OFFSET) + \
}
// TODO: remove it when cl_intel_subgroups_char extension will work
+// Sub-group block write of 16 uchars per work-item.
+// Uses intel_sub_group_block_write_uc16 when the cl_intel_subgroups_char
+// extension is available; otherwise emulates it with a manual scatter where
+// each work-item writes its 16 components strided by the sub-group size.
+inline void FUNC(sub_group_block_write_uchar16)(__global uchar* outPtr, uchar16 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc16(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+
+    // Emulated block write: one component per stride step.
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s1; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s2; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s3; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s4; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s5; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s6; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s7; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s8; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s9; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sa; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sb; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sc; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sd; idx += get_max_sub_group_size();
+    outPtr[idx] = v.se; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sf; idx += get_max_sub_group_size();
+#endif
+}
+
+// Sub-group block read of 16 uchars per work-item.
+// With cl_intel_subgroups_char, issues two uc8 block reads instead of a
+// single uc16 (compiler-support workaround, see below); without the
+// extension, emulates the read with a strided gather per component.
+inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    // WA for compiler support
+    // return intel_sub_group_block_read_uc16(ptr);
+    return (uchar16)(intel_sub_group_block_read_uc8(ptr), intel_sub_group_block_read_uc8(ptr + 8 * get_max_sub_group_size()));
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar16 ret;
+
+    // Emulated block read: one component per stride step.
+    ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s4 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s5 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s6 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s8 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s9 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sa = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sb = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sc = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sd = ptr[idx]; idx += get_max_sub_group_size();
+    ret.se = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sf = ptr[idx]; idx += get_max_sub_group_size();
+
+    return ret;
+#endif
+}
+
inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v)
{
#ifdef cl_intel_subgroups_char
#else
uint idx = get_sub_group_local_id();
- outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+ outPtr[idx] = v.s0; idx += get_max_sub_group_size();
outPtr[idx] = v.s1; idx += get_max_sub_group_size();
outPtr[idx] = v.s2; idx += get_max_sub_group_size();
outPtr[idx] = v.s3; idx += get_max_sub_group_size();
ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
return ret;
+#endif
+}
+
+// Sub-group block write of 4 uchars per work-item.
+// Uses intel_sub_group_block_write_uc4 when cl_intel_subgroups_char is
+// available; otherwise scatters the 4 components strided by sub-group size.
+inline void FUNC(sub_group_block_write_uchar4)(__global uchar* outPtr, uchar4 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc4(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+
+    // Emulated block write: one component per stride step.
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s1; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s2; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s3; idx += get_max_sub_group_size();
+#endif
+}
+
+// Sub-group block read of 4 uchars per work-item.
+// Uses intel_sub_group_block_read_uc4 when cl_intel_subgroups_char is
+// available; otherwise gathers the 4 components strided by sub-group size.
+inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc4(ptr);
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar4 ret;
+
+    // Emulated block read: one component per stride step.
+    ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
+
+    return ret;
+#endif
+}
+
+// Sub-group block write of 2 uchars per work-item.
+// Uses intel_sub_group_block_write_uc2 when cl_intel_subgroups_char is
+// available; otherwise scatters the 2 components strided by sub-group size.
+inline void FUNC(sub_group_block_write_uchar2)(__global uchar* outPtr, uchar2 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc2(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s1; idx += get_max_sub_group_size();
+#endif
+}
+
+// Sub-group block read of 2 uchars per work-item.
+// Uses intel_sub_group_block_read_uc2 when cl_intel_subgroups_char is
+// available; otherwise gathers the 2 components strided by sub-group size.
+inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc2(ptr);
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar2 ret;
+
+    ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
+
+    return ret;
+#endif
+}
+
+// Sub-group block write of a single uchar per work-item.
+// Uses intel_sub_group_block_write_uc when cl_intel_subgroups_char is
+// available; otherwise each work-item writes its value at its lane index.
+inline void FUNC(sub_group_block_write_uchar)(__global uchar* outPtr, uchar v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+
+    outPtr[idx] = v;
+#endif
+}
+
+// Sub-group block read of a single uchar per work-item.
+// Uses intel_sub_group_block_read_uc when cl_intel_subgroups_char is
+// available; otherwise each work-item loads the element at its lane index.
+inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc(ptr);
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar ret;
+
+    ret = ptr[idx];
+
+    return ret;
#endif
}
#define SLM_BLOCK_WRITE_4(A, B) (FUNC_CALL(intel_sub_group_block_write_4)(A, B))
#define SLM_BLOCK_READ_4(A) (FUNC_CALL(intel_sub_group_block_read_uint4)(A))
#define SLM_BLOCK_READ_8(A) (FUNC_CALL(intel_sub_group_block_read_uint8)(A))
+
+// Convenience wrappers: BLOCK_READ_UC_n / BLOCK_WRITE_UC_n transfer n uchars per
+// work-item via the sub_group_block_* helpers above (hardware block ops when the
+// cl_intel_subgroups_char extension is present, strided emulation otherwise).
+#define BLOCK_READ_UC_1(ptr) FUNC_CALL(sub_group_block_read_uchar)(ptr)
+#define BLOCK_READ_UC_2(ptr) FUNC_CALL(sub_group_block_read_uchar2)(ptr)
+#define BLOCK_READ_UC_4(ptr) FUNC_CALL(sub_group_block_read_uchar4)(ptr)
+#define BLOCK_READ_UC_8(ptr) FUNC_CALL(sub_group_block_read_uchar8)(ptr)
+#define BLOCK_READ_UC_16(ptr) FUNC_CALL(sub_group_block_read_uchar16)(ptr)
+
+#define BLOCK_WRITE_UC_1(ptr, val) FUNC_CALL(sub_group_block_write_uchar)(ptr, val)
+#define BLOCK_WRITE_UC_2(ptr, val) FUNC_CALL(sub_group_block_write_uchar2)(ptr, val)
+#define BLOCK_WRITE_UC_4(ptr, val) FUNC_CALL(sub_group_block_write_uchar4)(ptr, val)
+#define BLOCK_WRITE_UC_8(ptr, val) FUNC_CALL(sub_group_block_write_uchar8)(ptr, val)
+#define BLOCK_WRITE_UC_16(ptr, val) FUNC_CALL(sub_group_block_write_uchar16)(ptr, val)
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/data_types.cl"
+#include "include/common.cl"
+#include "include/mmad.cl"
+
+#include "mvn_gpu_b_fs_yx_fsv16_imad_accumulate.cl"
+#include "mvn_gpu_b_fs_yx_fsv16_imad_reduce.cl"
+
+// MVN - performs mean-variance normalization, that is normalizes the input data to have
+// 0 mean and if NORMALIZE_VARIANCE is set to have variance 1.
+//
+// Below is a set of 5 kernels:
+// mvn_mean_1, mvn_mean_2, mvn_var_1, mvn_var_2, mvn_final
+// that can perform mvn operation in two modes.
+//
+// Basic mode:
+// In this mode only mvn_final kernel is used. It performs required reductions for mean
+// and variance in this single kernel using single work-group for slice of data-sets
+// and reducing intermediate values with local memory.
+// It does not require any additional jit constants.
+// lws: LWS x 1 x 1
+// gws: LWS x feature x batch
+//
+// Parallel mode:
+// In this mode all kernels are used to provide extra parallelism with global memory
+// and host side synchronization with events/in-order queue.
+// To calculate mean:
+// mvn_mean_1 kernel should be first enqueued, providing extra global memory on second input
+// allowing intermediate results from all work-groups to be stored.
+// To activate this kernel MVN_KERNEL_MEAN_1 must be defined and evaluate to true/1.
+// lws: LWS x 1 x 1
+// gws: LWS * ITEM_GROUPS x feature x batch
+// This kernel will calculate partial results for each ITEM_GROUPS work-groups and store it into global memory.
+//
+// mvn_mean_2 kernel must be next enqueued in order to further reduce previous results using single work-group.
+// This kernel expects on first input the result of mvn_mean_1 and on second input global memory of size
+// batch * align(feature, FSV) should be provided to store final mean values.
+// It needs to be ensured that mvn_mean_1 kernel has finished and stored its partial results into memory.
+// To activate this kernel MVN_KERNEL_MEAN_2 must be defined and evaluate to true/1.
+// lws: LWS x 1 x 1
+// gws: LWS x feature x batch
+//
+// If required, analogously the mvn_var_1 and mvn_var_2 kernels should be enqueued, additionally providing results from
+// mvn_mean_2 kernel.
+//
+// Finally the mvn_final kernel should be enqueued with provided buffers with outputs from previous kernels (mvn_mean_2, mvn_var_2).
+// To enable parallel mode PRECALC_MEAN and optionally PRECALC_VARIANCE definitions should be used.
+// As at this stage there is no further need to synchronize and this kernel will perform simple normalization given known mean and inverse of variance.
+// Due to this, the kernel can be enqueued with full parallelization, not limiting it to a single work-group.
+// lws: SIMD x 1 x 1
+// gws: (x * y) / SIMD * SIMD x feature x batch
+//
+// Required jit constants:
+// SIMD - Sub-group/simd size.
+// LWS - Local work-size along 0th dimension, must be multiple of SIMD.
+// GWS - Global work-size along 0th dimension.
+// In basic mode this must be equal to LWS.
+// In parallel mode this must be equal to LWS * ITEM_GROUPS, except in mvn_final kernel where it has no restrictions.
+// ITEM_GROUPS - Number of work-groups performing accumulation in parallel mode. Should be the same in both stages of parallel kernels.
+
+
+// Feature slice size of the b_fs_yx_fsv16 layout.
+#define FSV 16
+// Pitch (in values) between consecutive packed slices in the input (== FSV for fsv16).
+#define INPUT_SLICE_PITCH 16
+// Number of sub-groups inside one work-group.
+#define SG_NUM (LWS / SIMD)
+
+#define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
+#define INPUT_TYPE4 MAKE_VECTOR_TYPE(INPUT0_TYPE, 4)
+#define INPUT_TYPE8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+#define INPUT_PACKED_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, FSV)
+#define OUTPUT_PACKED_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, FSV)
+#define MEAN_PACKED_TYPE MAKE_VECTOR_TYPE(MEAN_TYPE, FSV)
+#define INT_PACKED_TYPE MAKE_VECTOR_TYPE(int, FSV)
+
+#define TO_MEAN_PACKED_TYPE CAT(convert_, MEAN_PACKED_TYPE)
+
+// Number of spatial items in one data set (one feature slice).
+#define ITEMS_NUM (OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// ================================================================================================
+#if MVN_KERNEL_MEAN_1
+
+// Per-work-group partial sums of input values, FSV features per vector.
+DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM)
+
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, int, FSV, SG_NUM, REDUCE_NO_POST_OP)
+
+// Parallel mode, mean stage 1: each of ITEM_GROUPS work-groups accumulates its
+// share of the data set and stores one FSV-wide partial sum into
+// intermidiate_sum, to be reduced further by mvn_mean_2.
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input,
+ __global int* intermidiate_sum) {
+ uint b = get_global_id(2);
+ uint f = get_global_id(1) * FSV;
+ uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+ uint items_group = get_group_id(0);
+ const uint sgid = get_sub_group_id();
+ const uint sglid = get_sub_group_local_id();
+
+ const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+
+ // NOTE(review): zero-sized when SG_NUM == 1 — presumably the host guarantees LWS > SIMD; verify.
+ __local int slm_acc[(SG_NUM - 1) * FSV];
+
+ INT_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_global_id(0));
+ int full_sum = FUNC_CALL(reduce_sum_across_sg)(partial_sum, slm_acc);
+
+ // Only the first sub-group holds valid reduction results, one feature per lane.
+ if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+ intermidiate_sum[flat_data_set_group * ITEM_GROUPS * FSV + items_group * FSV + sglid] = full_sum;
+ }
+}
+// ================================================================================================
+#elif MVN_KERNEL_MEAN_2
+
+// Sums the per-group partial sums produced by mvn_mean_1 (ITEM_GROUPS entries).
+DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, int, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
+
+#define CALC_MEAN(sum) ((sum) / ITEMS_NUM)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_mean_across_sg, MEAN_TYPE, FSV, SG_NUM, CALC_MEAN)
+
+// Parallel mode, mean stage 2: a single work-group per data set reduces the
+// partial sums to the final per-feature mean and stores it into intermidiate_mean.
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_mean_2)(const __global int* intermidiate_sum,
+ __global MEAN_TYPE* intermidiate_mean) {
+ uint b = get_global_id(2);
+ uint f = get_global_id(1) * FSV;
+ uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+ const uint sgid = get_sub_group_id();
+ const uint sglid = get_sub_group_local_id();
+
+ const uint data_sets_offset = flat_data_set_group * ITEM_GROUPS * FSV;
+
+ INT_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum_input)(intermidiate_sum, data_sets_offset, get_local_id(0));
+ __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+ MEAN_TYPE mean = FUNC_CALL(reduce_mean_across_sg)(TO_MEAN_PACKED_TYPE(complete_sum), slm_acc);
+
+ // Only the first sub-group holds valid reduction results, one feature per lane.
+ if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+ intermidiate_mean[flat_data_set_group * FSV + sglid] = mean;
+ }
+}
+// ================================================================================================
+#elif MVN_KERNEL_VAR_1
+
+// Extra argument plumbing: the per-feature mean is passed through the
+// accumulate template into the AccOp below.
+#define EXTRA_ARGS_DECL_IMPL , MEAN_TYPE mean
+#define EXTRA_ARGS_IMPL , mean
+#define EXTRA_ARGS_DECL EXTRA_ARGS_DECL_IMPL
+#define EXTRA_ARGS EXTRA_ARGS_IMPL
+// Accumulates squared deviation from the mean of feature `idx` (mean is held
+// one-feature-per-lane, hence the shuffle).
+#define ACCUMULATE_SUM_SQ_DEV(curr, next, idx, mean) ACCUMULATE_SUM_SQ(curr, TO_MEAN_TYPE(next) - intel_sub_group_shuffle(mean, idx), idx)
+DECLARE_PACKED_ACCUMULATE_EARGS(accumulate_sum_sq_dev, MEAN_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM_SQ_DEV, EXTRA_ARGS_DECL, EXTRA_ARGS)
+
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, MEAN_TYPE, FSV, SG_NUM, REDUCE_NO_POST_OP)
+
+// Parallel mode, variance stage 1: given the means from mvn_mean_2, each of
+// ITEM_GROUPS work-groups stores a partial sum of squared deviations.
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_var_1)(const __global INPUT0_TYPE* input,
+ const __global MEAN_TYPE* means,
+ __global MEAN_TYPE* intermidiate_sum) {
+ uint b = get_global_id(2);
+ uint f = get_global_id(1) * FSV;
+ uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+ uint items_group = get_group_id(0);
+ const uint sgid = get_sub_group_id();
+ const uint sglid = get_sub_group_local_id();
+
+ const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+
+ __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+
+ // One feature's mean per sub-group lane.
+ MEAN_TYPE mean = means[flat_data_set_group * FSV + sglid];
+ MEAN_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_sq_dev)(input, data_sets_offset, get_global_id(0), mean);
+ MEAN_TYPE full_sum = FUNC_CALL(reduce_sum_across_sg)(partial_sum, slm_acc);
+
+ if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+ intermidiate_sum[flat_data_set_group * ITEM_GROUPS * FSV + items_group * FSV + sglid] = full_sum;
+ }
+}
+// ================================================================================================
+#elif MVN_KERNEL_VAR_2
+
+DECLARE_PACKED_ACCUMULATE(accumulate_sum, MEAN_TYPE, MEAN_TYPE, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
+
+// Post-op: turn the summed squared deviations into 1/sqrt(var + EPSILON).
+#define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr((sum_diff_sq) / ITEMS_NUM + (MEAN_TYPE)EPSILON, -0.5f)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_var_across_sg, MEAN_TYPE, FSV, SG_NUM, CALC_INVERSE_VARIANCE)
+
+// Parallel mode, variance stage 2: reduces the partial squared-deviation sums
+// from mvn_var_1 and stores the final per-feature inverse variance.
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_var_2)(const __global MEAN_TYPE* intermidiate_sum,
+ __global MEAN_TYPE* intermidiate_ivar) {
+ uint b = get_global_id(2);
+ uint f = get_global_id(1) * FSV;
+ uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+ // NOTE(review): items_group is unused in this kernel.
+ uint items_group = get_group_id(0);
+ const uint sgid = get_sub_group_id();
+ const uint sglid = get_sub_group_local_id();
+
+ const uint data_sets_offset = flat_data_set_group * ITEM_GROUPS * FSV;
+
+ MEAN_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum)(intermidiate_sum, data_sets_offset, get_local_id(0));
+
+ __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+ MEAN_TYPE inv_variance = FUNC_CALL(reduce_var_across_sg)(complete_sum, slm_acc);
+
+ if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+ intermidiate_ivar[flat_data_set_group * FSV + sglid] = inv_variance;
+ }
+}
+// ================================================================================================
+#else // MVN_KERNEL_MAIN
+
+// Mean:
+DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM)
+
+#define CALC_MEAN(sum) ((sum) / ITEMS_NUM)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_mean, MEAN_TYPE, FSV, SG_NUM, CALC_MEAN)
+
+// Variance:
+#define EXTRA_ARGS_DECL_IMPL , MEAN_TYPE mean
+#define EXTRA_ARGS_IMPL , mean
+#define EXTRA_ARGS_DECL EXTRA_ARGS_DECL_IMPL
+#define EXTRA_ARGS EXTRA_ARGS_IMPL
+// NOTE(review): unlike the MVN_KERNEL_VAR_1 variant, `next` is not wrapped in
+// TO_MEAN_TYPE here — presumably implicit conversion is relied on; verify.
+#define ACCUMULATE_SUM_SQ_DEV(curr, next, idx, mean) ACCUMULATE_SUM_SQ(curr, next - intel_sub_group_shuffle(mean, idx), idx)
+DECLARE_PACKED_ACCUMULATE_EARGS(accumulate_sum_sq_dev, MEAN_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM_SQ_DEV, EXTRA_ARGS_DECL, EXTRA_ARGS)
+
+// Post-op: turn the summed squared deviations into 1/sqrt(var + EPSILON).
+#define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr((sum_diff_sq) / ITEMS_NUM + (MEAN_TYPE)EPSILON, -0.5f)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_inverse_variance, MEAN_TYPE, FSV, SG_NUM, CALC_INVERSE_VARIANCE)
+
+// Reads one packed FSV-wide slice as uchars and bit-casts to INPUT_PACKED_TYPE.
+#define INPUT_PACKED_BLOCK_READ(ptr) CAT(as_, INPUT_PACKED_TYPE)(CAT(BLOCK_READ_UC_, FSV)((const __global uchar*)ptr))
+
+#define OUTPUT_PAD_IN_ITEMS (OUTPUT_PAD_BEFORE_SIZE_X != 0 || OUTPUT_PAD_AFTER_SIZE_X != 0 || OUTPUT_PAD_BEFORE_SIZE_Y != 0)
+
+// Final normalization kernel: writes output = (input - mean) * inv_variance.
+// In parallel mode (PRECALC_MEAN / PRECALC_VARIANCE) the statistics are read
+// from the buffers produced by mvn_mean_2 / mvn_var_2; otherwise they are
+// reduced here across the whole work-group using local memory.
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_final)(
+ const __global INPUT0_TYPE* input,
+ __global OUTPUT_TYPE* restrict output
+#if HAS_FUSED_OPS_DECLS
+ , FUSED_OPS_DECLS
+#endif
+#if PRECALC_MEAN
+ , const __global MEAN_TYPE* means
+#endif
+#if PRECALC_VARIANCE
+ , const __global MEAN_TYPE* variances
+#endif
+) {
+ uint b = get_global_id(2);
+ uint f = get_global_id(1) * FSV;
+ uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+#if GWS != LWS
+ uint items_group = get_group_id(0);
+#else
+ uint items_group = 0;
+#endif
+ // Global sub-group index along the spatial dimension.
+ const uint sgid = get_sub_group_id() + items_group * SG_NUM;
+ const uint sglid = get_sub_group_local_id();
+
+ const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+ uint input_offset;
+
+#if !PRECALC_MEAN || (NORMALIZE_VARIANCE && !PRECALC_VARIANCE)
+ __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+#endif
+
+#if PRECALC_MEAN
+ // One feature's mean per sub-group lane.
+ MEAN_TYPE mean = means[flat_data_set_group * FSV + sglid];
+#else
+ INT_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_local_id(0));
+ MEAN_TYPE mean = FUNC_CALL(reduce_mean)(TO_MEAN_PACKED_TYPE(partial_sum), slm_acc);
+#endif
+
+#if NORMALIZE_VARIANCE
+# if PRECALC_VARIANCE
+ // One feature's inverse variance per sub-group lane.
+ MEAN_TYPE inv_variance = variances[flat_data_set_group * FSV + sglid];
+# else
+#  if !PRECALC_MEAN
+ // slm_acc is reused by the next reduction; the reduce helper requires an extra
+ // barrier so every sub-group finishes reading the mean results before they are
+ // overwritten.
+ barrier(CLK_LOCAL_MEM_FENCE);
+#  endif
+ MEAN_PACKED_TYPE partial_dev = FUNC_CALL(accumulate_sum_sq_dev)(input, data_sets_offset, get_local_id(0), mean);
+ MEAN_TYPE inv_variance = FUNC_CALL(reduce_inverse_variance)(partial_dev, slm_acc);
+# endif
+#else
+ MEAN_TYPE inv_variance = 1;
+#endif
+
+#if OUTPUT_IS_FP
+ input_offset = data_sets_offset + sgid * SIMD * FSV;
+ uint output_spatial_base = sgid * SIMD;
+ uint output_offset = OUTPUT_GET_INDEX(b, f, 0, 0) + sgid * SIMD * FSV;
+ // For fused ops to align with non-fp path
+ const uint set_idx = sglid;
+
+ // Uniform part: each sub-group normalizes SIMD spatial locations per iteration.
+ for (uint spatial_idx = 0; spatial_idx < ITEMS_NUM / GWS; ++spatial_idx) {
+ INPUT_PACKED_TYPE in_pack = INPUT_PACKED_BLOCK_READ(input + input_offset);
+
+ __attribute__((opencl_unroll_hint))
+ for (uint si = 0; si < SIMD; ++si) {
+ uint output_spatial = output_spatial_base + si;
+ MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[si]) - mean) * inv_variance;
+ OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+ FUSED_OPS;
+ result = FUSED_OPS_RESULT;
+#else
+ result = TO_OUTPUT_TYPE(normalized);
+#endif
+#if !OUTPUT_PAD_IN_ITEMS
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset + si * SIMD, result);
+#else
+ uint x = output_spatial % OUTPUT_SIZE_X;
+ uint y = output_spatial / OUTPUT_SIZE_X;
+ output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset, result);
+#endif
+ }
+ input_offset += GWS * FSV;
+ output_offset += GWS * FSV;
+ output_spatial_base += GWS;
+ }
+
+ // [constexpr] Number of leftovers after full local work-group iterations.
+ const uint lws_uniform_leftovers = ITEMS_NUM % GWS;
+ // [constexpr] Number of sub-groups that can process leftovers loading SIMD items.
+ const uint lws_uniform_leftovers_full_simds = lws_uniform_leftovers / SIMD;
+ // [constexpr] Number of leftovers after full sub-group processing.
+ const uint sg_uniform_leftovers = lws_uniform_leftovers % SIMD;
+
+ if (lws_uniform_leftovers_full_simds > 0 && sgid < lws_uniform_leftovers_full_simds) {
+ // Process leftovers that can use full sub-group.
+ INPUT_PACKED_TYPE in_pack = INPUT_PACKED_BLOCK_READ(input + input_offset);
+
+ __attribute__((opencl_unroll_hint))
+ for (uint si = 0; si < SIMD; ++si) {
+ uint output_spatial = output_spatial_base + si;
+ MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[si]) - mean) * inv_variance;
+ OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+ FUSED_OPS;
+ result = FUSED_OPS_RESULT;
+#else
+ result = TO_OUTPUT_TYPE(normalized);
+#endif
+#if !OUTPUT_PAD_IN_ITEMS
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset + si * SIMD, result);
+#else
+ uint x = output_spatial % OUTPUT_SIZE_X;
+ uint y = output_spatial / OUTPUT_SIZE_X;
+ output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset, result);
+#endif
+ }
+ } else if (lws_uniform_leftovers > 0 &&
+ sg_uniform_leftovers > 0 &&
+ sgid == lws_uniform_leftovers_full_simds) {
+ // TODO: May be worth to consider the data here as across sub-group
+ // Rest of leftovers, still use whole sub-group, but change addresses to not load extra data.
+ INPUT_PACKED_TYPE in_pack;
+ uint pack_idx = 0;
+ if (sg_uniform_leftovers >= 8) {
+ INPUT_TYPE8 tmp_in = DT_INPUT_BLOCK_READ8(input, input_offset + pack_idx * SIMD);
+ in_pack[pack_idx + 0] = tmp_in[0];
+ in_pack[pack_idx + 1] = tmp_in[1];
+ in_pack[pack_idx + 2] = tmp_in[2];
+ in_pack[pack_idx + 3] = tmp_in[3];
+ in_pack[pack_idx + 4] = tmp_in[4];
+ in_pack[pack_idx + 5] = tmp_in[5];
+ in_pack[pack_idx + 6] = tmp_in[6];
+ in_pack[pack_idx + 7] = tmp_in[7];
+ pack_idx += 8;
+ }
+ if (sg_uniform_leftovers % 8 >= 4) {
+ INPUT_TYPE4 tmp_in = DT_INPUT_BLOCK_READ4(input, input_offset + pack_idx * SIMD);
+ in_pack[pack_idx + 0] = tmp_in[0];
+ in_pack[pack_idx + 1] = tmp_in[1];
+ in_pack[pack_idx + 2] = tmp_in[2];
+ in_pack[pack_idx + 3] = tmp_in[3];
+ pack_idx += 4;
+ }
+ if (sg_uniform_leftovers % 4 >= 2) {
+ INPUT_TYPE2 tmp_in = DT_INPUT_BLOCK_READ2(input, input_offset + pack_idx * SIMD);
+ in_pack[pack_idx + 0] = tmp_in[0];
+ in_pack[pack_idx + 1] = tmp_in[1];
+ pack_idx += 2;
+ }
+ if (sg_uniform_leftovers % 2 == 1) {
+ in_pack[pack_idx] = DT_INPUT_BLOCK_READ(input, input_offset + pack_idx * SIMD);
+ }
+
+ // (Removed a dead `OUTPUT_PACKED_TYPE result;` declaration here — it was
+ // shadowed by the loop-local scalar below and never used in this branch.)
+ __attribute__((opencl_unroll_hint))
+ for (uint si = 0; si < sg_uniform_leftovers; ++si) {
+ uint output_spatial = output_spatial_base + si;
+ MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[si]) - mean) * inv_variance;
+ OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+ FUSED_OPS;
+ result = FUSED_OPS_RESULT;
+#else
+ result = TO_OUTPUT_TYPE(normalized);
+#endif
+#if !OUTPUT_PAD_IN_ITEMS
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset + si * SIMD, result);
+#else
+ uint x = output_spatial % OUTPUT_SIZE_X;
+ uint y = output_spatial / OUTPUT_SIZE_X;
+ output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+ DT_OUTPUT_BLOCK_WRITE(output, output_offset, result);
+#endif
+ }
+ }
+#else // => !OUTPUT_IS_FP
+ input_offset = data_sets_offset + sgid * SIMD * FSV;
+ uint output_offset = OUTPUT_GET_INDEX(b, f, 0, 0) + sgid * SIMD * FSV;
+ uint output_spatial = sgid * SIMD + sglid;
+
+ // Uniform part: each work-item normalizes one packed FSV-wide slice per iteration.
+ for (uint spatial_idx = 0; spatial_idx < ITEMS_NUM / GWS; ++spatial_idx) {
+ INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[sglid];
+
+ OUTPUT_PACKED_TYPE result;
+ __attribute__((opencl_unroll_hint))
+ for (uint set_idx = 0; set_idx < FSV; ++set_idx) {
+ MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[set_idx]) - intel_sub_group_shuffle(mean, set_idx)) * intel_sub_group_shuffle(inv_variance, set_idx);
+ #if HAS_FUSED_OPS
+ FUSED_OPS;
+ result[set_idx] = FUSED_OPS_RESULT;
+ #else
+ result[set_idx] = TO_OUTPUT_TYPE(normalized);
+ #endif
+ }
+#if !OUTPUT_PAD_IN_ITEMS
+ ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[sglid] = result;
+#else
+ uint x = output_spatial % OUTPUT_SIZE_X;
+ uint y = output_spatial / OUTPUT_SIZE_X;
+ output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+ ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[0] = result;
+#endif
+
+ input_offset += GWS * FSV;
+ output_offset += GWS * FSV;
+ output_spatial += GWS;
+ }
+
+ // [constexpr] Number of leftovers after full local work-group iterations.
+ const uint lws_uniform_leftovers = ITEMS_NUM % GWS;
+ // [constexpr] Number of sub-groups that can process leftovers loading SIMD items.
+ const uint lws_uniform_leftovers_full_simds = lws_uniform_leftovers / SIMD;
+ // [constexpr] Number of leftovers after full sub-group processing.
+ const uint sg_uniform_leftovers = lws_uniform_leftovers % SIMD;
+
+ if (lws_uniform_leftovers_full_simds > 0 && sgid < lws_uniform_leftovers_full_simds) {
+ // Process leftovers that can use full sub-group.
+ INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[sglid];
+
+ OUTPUT_PACKED_TYPE result;
+ __attribute__((opencl_unroll_hint))
+ for (uint set_idx = 0; set_idx < FSV; ++set_idx) {
+ MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[set_idx]) - intel_sub_group_shuffle(mean, set_idx)) * intel_sub_group_shuffle(inv_variance, set_idx);
+ #if HAS_FUSED_OPS
+ FUSED_OPS;
+ result[set_idx] = FUSED_OPS_RESULT;
+ #else
+ result[set_idx] = TO_OUTPUT_TYPE(normalized);
+ #endif
+ }
+#if !OUTPUT_PAD_IN_ITEMS
+ ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[sglid] = result;
+#else
+ uint x = output_spatial % OUTPUT_SIZE_X;
+ uint y = output_spatial / OUTPUT_SIZE_X;
+ output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+ ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[0] = result;
+#endif
+ } else if (lws_uniform_leftovers > 0 &&
+ sg_uniform_leftovers > 0 &&
+ sgid == lws_uniform_leftovers_full_simds) {
+ // TODO: May be worth to consider the data here as across sub-group
+ // Rest of leftovers, still use whole sub-group, but change addresses to not load extra data.
+ INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[sglid % sg_uniform_leftovers];
+
+ OUTPUT_PACKED_TYPE result;
+ __attribute__((opencl_unroll_hint))
+ for (uint set_idx = 0; set_idx < FSV; ++set_idx) {
+ MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[set_idx]) - intel_sub_group_shuffle(mean, set_idx)) * intel_sub_group_shuffle(inv_variance, set_idx);
+ #if HAS_FUSED_OPS
+ FUSED_OPS;
+ result[set_idx] = FUSED_OPS_RESULT;
+ #else
+ result[set_idx] = TO_OUTPUT_TYPE(normalized);
+ #endif
+ }
+ // Only lanes holding valid leftover data write out.
+ if (sglid < sg_uniform_leftovers) {
+#if !OUTPUT_PAD_IN_ITEMS
+ ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[sglid] = result;
+#else
+ uint x = output_spatial % OUTPUT_SIZE_X;
+ uint y = output_spatial / OUTPUT_SIZE_X;
+ output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+ ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[0] = result;
+#endif
+ }
+ }
+#endif
+}
+
+#endif
+// ================================================================================================
+
+#undef FSV
+#undef INPUT_SLICE_PITCH
+#undef SG_NUM
+
+#undef INPUT_TYPE2
+#undef INPUT_TYPE4
+#undef INPUT_TYPE8
+#undef INPUT_PACKED_TYPE
+#undef OUTPUT_PACKED_TYPE
+#undef INT_PACKED_TYPE
+#undef MEAN_PACKED_TYPE
+#undef TO_MEAN_PACKED_TYPE
+
+#undef INPUT_PACKED_BLOCK_READ
+#undef OUTPUT_PAD_IN_ITEMS
+
+#undef CEIL_DIV
+#undef USE_IMAD
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/data_types.cl"
+
+// ==============================================================================================================================
+// DECLARE_PACKED_ACCUMULATE(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp)
+// DECLARE_PACKED_ACCUMULATE_EARGS(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp, ExtraArgsDecl, ExtraArgs)
+//
+// Declares function "Name" performing parallel packed accumulation:
+// AccT<SliceSize> Name (const __global InputT* input, uint offset, uint worker_id ExtraArgsDecl)
+//
+// Template arguments:
+// Name - Name of function to declare.
+// AccT - Type of accumulator variable. Can't be vector type. Examples: int, float, half.
+// InputT - Type of input data. Can't be vector type. Examples: int, float, half.
+// SliceSize - Number of values in packed slice to accumulate in each work-item. One of: 2, 4, 8, 16.
+// SlicePitch - Pitch between consecutive input slices in "input".
+// Items - Total number of items to accumulate across all work-items.
+// Workers - Number of work-items performing accumulation.
+// AccOp - Name of operation used to perform accumulation.
+// Calling it "function-like" must return value of new accumulation variable.
+// Expected interface:
+// AccT AccOp(AccT current, InputT val, uint index ExtraArgs)
+// current - current accumulation value
+// val - currently processed input value
+// index - number of item inside slice currently processed
+// ExtraArgs - optional extra arguments passed as is from template argument
+// returns: new accumulator value after accumulating "val" with "current"
+// ExtraArgsDecl - Optional extra arguments declaration to pass to function.
+// ExtraArgs - Optional extra arguments to pass to "AccOp" using names declared in "ExtraArgsDecl".
+//
+// Function arguments:
+// input - Pointer to global memory from which values will be read to accumulate
+// offset - Offset into "input" from where accumulation should start
+// worker_id - Number of current work-item
+// ExtraArgsDecl - Optional extra arguments, declared from template argument.
+//
+// Pseudocode:
+// function Name(input, offset, worker_id, ExtraArgs... eargs) {
+// AccT<SliceSize> accumulator = 0;
+// for (uint idx = worker_id; idx < Items; idx += Workers) {
+// InputT<SliceSize> in = vload<SliceSize>(0, &input[offset + idx * SlicePitch]);
+// for (uint si = 0; si < SliceSize; ++si) {
+// accumulator[si] = AccOp(accumulator[si], in[si], si, eargs...)
+// }
+// }
+// return accumulator;
+// }
+//
+// ==============================================================================================================================
+
+// Accumulation operations usable as the AccOp template argument (see header above).
+#define ACCUMULATE_SUM(a, b, idx) ((a) + (b))
+#define ACCUMULATE_SUM_SQ(a, b, idx) ((a) + ((b) * (b)))
+
+#define DECLARE_PACKED_ACCUMULATE_EARGS(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp, ExtraArgsDecl, ExtraArgs) \
+inline MAKE_VECTOR_TYPE(AccT, SliceSize) FUNC(Name)(const __global InputT* input, \
+ uint offset, \
+ uint worker_id \
+ ExtraArgsDecl) { \
+ typedef MAKE_VECTOR_TYPE(InputT, SliceSize) packed_in_t; \
+ typedef MAKE_VECTOR_TYPE(AccT, SliceSize) packed_acc_t; \
+ \
+ packed_acc_t acc = 0; /* Accumulation variable */ \
+ \
+ uint input_offset = offset + worker_id * (SlicePitch); /* Current input offset */ \
+ \
+ /* Uniform loop to help compiler in unrolling */ \
+ for (uint spatial_idx = 0; spatial_idx < (Items) / (Workers); ++spatial_idx) { \
+ packed_in_t in_pack = ((const __global packed_in_t*)(input + input_offset))[0]; \
+ \
+ input_offset += (Workers) * (SlicePitch); \
+ \
+ __attribute__((opencl_unroll_hint)) \
+ for (uint set_idx = 0; set_idx < (SliceSize); ++set_idx) { \
+ acc[set_idx] = AccOp(acc[set_idx], in_pack[set_idx], set_idx ExtraArgs); \
+ } \
+ } \
+ \
+ /* [constexpr] Number of leftovers after all uniform iterations */ \
+ const uint leftovers = (Items) % (Workers); \
+ \
+ if (leftovers > 0 && worker_id < leftovers) { \
+ packed_in_t in_pack = ((const __global packed_in_t*)(input + input_offset))[0]; \
+ \
+ __attribute__((opencl_unroll_hint)) \
+ for (uint set_idx = 0; set_idx < (SliceSize); ++set_idx) { \
+ acc[set_idx] = AccOp(acc[set_idx], in_pack[set_idx], set_idx ExtraArgs); \
+ } \
+ } \
+ \
+ return acc; \
+}
+
+// Convenience variant with no extra arguments forwarded to AccOp.
+#define DECLARE_PACKED_ACCUMULATE(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp) \
+ DECLARE_PACKED_ACCUMULATE_EARGS(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp, , )
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+
+// ==============================================================================================================================
+// DECLARE_WG_PACKED_REDUCE_ADD(Name, Type, VecSize, SgNum, PostOp)
+//
+// Declares function "Name" performing work-group reduction on vector data, using addition operator:
+// Type Name (Type<VecSize> value, __local Type* slm_acc)
+// Returns reduction result as sub-group vector, for example when VecSize equals 4:
+// work-item for which get_sub_group_local_id() == 0 will hold reduced values from value.s0
+// work-item for which get_sub_group_local_id() == 1 will hold reduced values from value.s1
+// work-item for which get_sub_group_local_id() == 2 will hold reduced values from value.s2
+// work-item for which get_sub_group_local_id() == 3 will hold reduced values from value.s3
+// for other work-items in sub-group the result will be undefined.
+// All work-items in sub-group must enter declared function.
+//
+// Template arguments:
+// Name - Name of function to declare.
+// Type - Type of values to reduce. Can't be vector type. Examples: int, float, half.
+// VecSize - Vector size of input, one of 2,4,8,16. Must be smaller or equal to sub-group size.
+// SgNum - Number of sub-groups inside work-group.
+// PostOp - Operation to perform on reduced values.
+// Called as PostOp(value), where "value" is reduction result, and call should evaluate to expression returning final result.
+//
+// Function arguments:
+// value - vector of "VecSize" elements of "Type" holding values to reduce.
+// slm_acc - pointer to local memory used for reduction. Must have size of at least ("SgNum" - 1) * "VecSize".
+//
+// Pseudocode:
+// function Name(value, slm_acc) {
+// Type result;
+// for (uint vi = 0; vi < VecSize; ++vi) {
+// Type tmp = work_group_reduce_add(value[vi]);
+// if (get_sub_group_local_id() == vi) {
+// result = tmp;
+// }
+// }
+// return result;
+// }
+//
+// Notes:
+// If local memory is going to be reused, an additional barrier(CLK_LOCAL_MEM_FENCE) is required to ensure that all usage inside
+// declared function has finished.
+// ==============================================================================================================================
+
+// Identity post-op for DECLARE_WG_PACKED_REDUCE_ADD.
+#define REDUCE_NO_POST_OP(val) (val)
+
+#define DECLARE_WG_PACKED_REDUCE_ADD(Name, Type, VecSize, SgNum, PostOp) \
+ inline Type FUNC(Name) (MAKE_VECTOR_TYPE(Type, VecSize) value, __local Type* slm_acc) { \
+ typedef MAKE_VECTOR_TYPE(Type, VecSize) packed_t; \
+ \
+ Type result; \
+ \
+ /* [uniform] Current sub-groups id */ \
+ const uint sgid = get_sub_group_id(); \
+ /* Id of work-item inside sub-group */ \
+ const uint sglid = get_sub_group_local_id(); \
+ /* [constexpr] Maximum simd/sub-group size */ \
+ const uint simd = get_max_sub_group_size(); \
+ \
+ /* Accumulation inside sub-group */ \
+ packed_t acc; /* [uniform] Accumulator variable */ \
+ __attribute__((opencl_unroll_hint)) \
+ for (uint idx = 0; idx < VecSize; ++idx) { \
+ acc[idx] = sub_group_reduce_add(value[idx]); \
+ } \
+ if ((SgNum) != 1) { \
+ /* More than one sub-group in work-group, reduce using local memory */ \
+ /* Store partial results into local memory from sub-groups other than first one */ \
+ if (sgid != 0 && (sglid < VecSize || simd == VecSize)) { \
+ slm_acc[(sgid - 1) * VecSize + sglid] = acc[sglid]; \
+ } \
+ barrier(CLK_LOCAL_MEM_FENCE); \
+ /* Accumulate partial results inside first sub-group */ \
+ if (sgid == 0) { \
+ __attribute__((opencl_unroll_hint)) \
+ for (uint vi = 0; vi < VecSize; ++vi) { \
+ /* Accumulate single vector element using sub_group_reduce_add */ \
+ /* Last work-item inside sub-group holds previous value (iteration or sub-group reduction stage) */ \
+ \
+ Type tmp = acc[vi]; \
+ __attribute__((opencl_unroll_hint)) \
+ for (uint sg = 0; sg < (SgNum) - 1; sg += (simd - 1)) { \
+ bool last_sglid = sglid == simd - 1; \
+ bool sglid_inside_sgs = sg + simd - 1 <= (SgNum) - 1 || sg + sglid < (SgNum) - 1; \
+ Type tmp_in_slm = slm_acc[sg * VecSize + sglid * VecSize + vi]; \
+ tmp = last_sglid ? tmp : \
+ sglid_inside_sgs ? tmp_in_slm \
+ : 0; \
+ tmp = sub_group_reduce_add(tmp); \
+ } \
+ acc[vi] = tmp; \
+ } \
+ if (sglid < VecSize || simd == VecSize) { \
+ result = PostOp(acc[sglid]); \
+ slm_acc[sglid] = result; \
+ } \
+ } \
+ barrier(CLK_LOCAL_MEM_FENCE); \
+ /* Read result in all other sub-groups */ \
+ if (sgid != 0 && (sglid < VecSize || simd == VecSize)) { \
+ result = slm_acc[sglid]; \
+ } \
+ } else { \
+ /* Single sub-group case, just transpose the data to correct layout */ \
+ /* (no store to slm_acc: its documented capacity, (SgNum - 1) * VecSize, is 0 here) */ \
+ if (sglid < VecSize || simd == VecSize) { \
+ result = PostOp(acc[sglid]); \
+ } \
+ } \
+ return result; \
+ }
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16
const uint x = get_global_id(1);
const uint y = get_global_id(2);
- const uint bf = get_global_id(0);
+ const uint bf = (uint)get_global_id(0);
const uint f = bf / INPUT0_BATCH_NUM;
const uint b = bf % INPUT0_BATCH_NUM;
const uint z = 0;
#include "include/data_types.cl"
+#define INPUT_TYPE4 MAKE_VECTOR_TYPE(INPUT_REORDER_TYPE, 4)
+#define OUTPUT_TYPE4 MAKE_VECTOR_TYPE(OUTPUT_REORDER_TYPE, 4)
+
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
}
KERNEL (reorder_data)(
-#if defined INPUT0_LAYOUT_NV12
+#if INPUT0_LAYOUT_NV12 || INPUT0_LAYOUT_IMAGE_2D_RGBA
read_only image2d_t input,
#else
const __global INPUT_REORDER_TYPE* input,
#endif
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ write_only image2d_t output
+#else
__global OUTPUT_REORDER_TYPE* output
+#endif
#ifdef MEAN_SUBTRACT_IN_BUFFER
, __global MEAN_SUBTRACT_TYPE* mean_subtract
#endif
#if defined INPUT0_LAYOUT_NV12
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP;
float4 colorVYU = read_imagef(input, sampler, (int2)(x, y));
-
+
float Ycomponent = mad(colorVYU.s1, 296.82f, -18.624f);
float Ucomponent = mad(colorVYU.s2, 255.0f, -128.f);
float Vcomponent = mad(colorVYU.s0, 255.0f, -128.f);
float B = clamp(mad(Vcomponent, 1.596f, Ycomponent), 0.f, 255.f);
float R = clamp(mad(Ucomponent, 2.018f, Ycomponent), 0.f, 255.f);
float G = clamp(mad(Vcomponent, -0.813f, mad(Ucomponent, -0.391f, Ycomponent)), 0.f, 255.f);
-
+#elif defined INPUT0_LAYOUT_IMAGE_2D_RGBA
+ const sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP;
+ OUTPUT_TYPE4 colorRGBA = IMAGE_READ(input, (int2)(x, y));
+#elif defined OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, f, w, z, y, x);
+ const uint input_idx_R = FUNC_CALL(get_input_index)(b, 0, w, z, y, x);
+ const uint input_idx_G = FUNC_CALL(get_input_index)(b, 1, w, z, y, x);
+ const uint input_idx_B = FUNC_CALL(get_input_index)(b, 2, w, z, y, x);
+#if OUTPUT_FEATURE_NUM == 3
+ INPUT_TYPE4 colorRGBA = { TO_INPUT_REORDER_TYPE(input[input_idx_R]), TO_INPUT_REORDER_TYPE(input[input_idx_G]), TO_INPUT_REORDER_TYPE(input[input_idx_B]), TO_INPUT_REORDER_TYPE(0.f) };
+#else
+ const uint input_idx_A = FUNC_CALL(get_input_index)(b, 3, w, z, y, x);
+ INPUT_TYPE4 colorRGBA = { TO_INPUT_REORDER_TYPE(input[input_idx_R]), TO_INPUT_REORDER_TYPE(input[input_idx_G]), TO_INPUT_REORDER_TYPE(input[input_idx_B]), TO_INPUT_REORDER_TYPE(input[input_idx_A]) };
+#endif
#else
uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, f, w, z, y, x);
const uint input_idx = FUNC_CALL(get_input_index)(b, f, w, z, y, x);
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);
+#elif INPUT0_LAYOUT_IMAGE_2D_RGBA
+ uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
+ uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+ output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s0), NL_M, NL_N);
+ ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
+ output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+ output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s1), NL_M, NL_N);
+ ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
+ output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+ output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s2), NL_M, NL_N);
+#if INPUT0_FEATURE_NUM == 4
+ ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 3, w, z, y, x);
+ output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+ output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s3), NL_M, NL_N);
+#endif
+#elif OUTPUT_LAYOUT_IMAGE_2D_RGBA
+ IMAGE_WRITE(output, (int2)(x, y), colorRGBA);
#else
#if INPUT0_IS_FP && !OUTPUT_IS_FP
// TODO: check if this round really needed. Right now it's added to have the same behavior as CPU plugin
#endif
#endif
}
+
+#undef INPUT_TYPE4
+#undef OUTPUT_TYPE4
#elif defined INPUT0_LAYOUT_GYXIO || \
defined INPUT0_LAYOUT_GOIYX
return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
+#elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
+ return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
+#elif defined INPUT0_LAYOUT_GS_OI_YXS_GSV16_YXSV4
+ return GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(INPUT0, g, o, i, y, x);
+#elif defined INPUT0_LAYOUT_GS_OI_YXS_GSV32_YXSV4
+ return GET_FILTER_GS_OI_YXS_GSV32_YXSV4_INDEX(INPUT0, g, o, i, y, x);
#else
#error reorder_weights.cl: input format - not supported
#endif
return GET_FILTER_GS_OI_YXS_GSV4_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
#elif defined OUTPUT_LAYOUT_G_OS_IS_YX_ISV16_OSV16
return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
+#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
+ return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_GS_OI_YXS_GSV16_YXSV4
+ return GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_GS_OI_YXS_GSV32_YXSV4
+ return GET_FILTER_GS_OI_YXS_GSV32_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_G_OS_IS_YX_OSV16_ISV4
+ return GET_FILTER_G_OS_IS_YX_OSV16_ISV4_INDEX(OUTPUT, g, o, i, y, x);
#else
#error reorder_weights.cl: output format - not supported
#endif
#endif
)
{
-#if defined(SAMPLE_TYPE_NEAREST)
+#if defined(SAMPLE_TYPE_NEAREST) && FEATURE_PACKED_MODE
+ typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, PACK_SIZE) in_pack_t;
+ typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, PACK_SIZE) out_pack_t;
+
+ const int ox = get_global_id(0);
+ const int oy = get_global_id(1) % OUTPUT_SIZE_Y;
+ const int oz = get_global_id(1) / OUTPUT_SIZE_Y;
+ const int feature = (get_global_id(2) * PACK_SIZE) % OUTPUT_FEATURE_NUM;
+ const int batch = (get_global_id(2) * PACK_SIZE) / OUTPUT_FEATURE_NUM;
+ const int ix = floor(ox * X_RATIO);
+ const int iy = floor(oy * Y_RATIO);
+ const int iz = floor(oz * Z_RATIO);
+
+ uint input_idx = FUNC_CALL(get_input_index)(batch, feature, iz, iy, ix);
+ uint output_idx = FUNC_CALL(get_output_index)(batch, feature, oz, oy, ox);
+
+ in_pack_t interp_val_pack = ((const __global in_pack_t*)(input + input_idx))[0];
+ out_pack_t res;
+ unroll_for (uint pi = 0; pi < PACK_SIZE; ++pi) {
+ INPUT0_TYPE interp_val = interp_val_pack[pi];
+ #if HAS_FUSED_OPS
+ #define OF_ID (feature + pi)
+ FUSED_OPS;
+ res[pi] = FUSED_OPS_RESULT;
+ #else
+ res[pi] = ACTIVATION(interp_val, ACTIVATION_PARAMS);
+ #endif
+ }
+ ((__global out_pack_t*)(output + output_idx))[0] = res;
+
+#elif defined(SAMPLE_TYPE_NEAREST)
const int ox = get_global_id(0);
#if OUTPUT_DIMS <= 4
const int oy = get_global_id(1);
const int oy = get_global_id(1);
const int feature = 0;
const int batch = get_global_id(2);
- const INPUT0_TYPE ix = TO_INPUT0_TYPE(X_RATIO) * ox;
- const INPUT0_TYPE iy = TO_INPUT0_TYPE(Y_RATIO) * oy;
+ const float ix = X_RATIO * ox;
+ const float iy = Y_RATIO * oy;
#ifdef LEFTOVERS
if (ox >= OUTPUT_SIZE_X)
return;
#endif
- const int top_y_index = (int)(floor(iy));
- const int bottom_y_index = (int)(min(ceil(iy), TO_INPUT0_TYPE(INPUT0_SIZE_Y) - 1));
- const int left_x_index = (int)(floor(ix));
- const int right_x_index = (int)(min(ceil(ix), TO_INPUT0_TYPE(INPUT0_SIZE_X) - 1));
+ const int top_y_index = (int)(floor(iy));
+ const int bottom_y_index = (int)(min(TO_INPUT0_TYPE(ceil(iy)), TO_INPUT0_TYPE(INPUT0_SIZE_Y) - 1));
+ const int left_x_index = (int)(floor(ix));
+ const int right_x_index = (int)(min(TO_INPUT0_TYPE(ceil(ix)), TO_INPUT0_TYPE(INPUT0_SIZE_X) - 1));
- const INPUT0_TYPE dx = ix - left_x_index;
- const INPUT0_TYPE dy = iy - top_y_index;
+ const INPUT0_TYPE dx = TO_INPUT0_TYPE(ix - left_x_index);
+ const INPUT0_TYPE dy = TO_INPUT0_TYPE(iy - top_y_index);
- unroll_for (int in_f = 0; in_f < OUTPUT_FEATURE_NUM; in_f++) {
- INPUT0_TYPE top_left = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, left_x_index)];
- INPUT0_TYPE top_right = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, right_x_index)];
- INPUT0_TYPE bottom_left = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, left_x_index)];
+ unroll_for(int in_f = 0; in_f < OUTPUT_FEATURE_NUM; in_f++) {
+ INPUT0_TYPE top_left = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, left_x_index)];
+ INPUT0_TYPE top_right = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, right_x_index)];
+ INPUT0_TYPE bottom_left = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, left_x_index)];
INPUT0_TYPE bottom_right = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, right_x_index)];
- INPUT0_TYPE top = top_left + (top_right - top_left) * dx;
+ INPUT0_TYPE top = top_left + (top_right - top_left) * dx;
INPUT0_TYPE bottom = bottom_left + (bottom_right - bottom_left) * dx;
INPUT0_TYPE interp_val = top + (bottom - top) * dy;
} else if (_tensor.LogicalSize() == _tensor.Feature().v) {
// We support broadcast only if corresponding dimension is equal to 1.
// Otherwise, dimensions should be equal and using "f" should be safe.
- if (_tensor.PitchesDifferFromLogicalDims()) {
+ if (_tensor.PitchesDifferFromLogicalDims() && _tensor.SimpleLayout()) {
std::string f_pitch = std::to_string(_tensor.Feature().pitch);
definitions.push_back({ safe_index_func_name, "(" + offset + " + (f) * " + f_pitch + ")" });
definitions.push_back({ index_func_name, "(" + offset + " + (f) * " + f_pitch + ")" });
+ } else if (_tensor.PitchesDifferFromLogicalDims()) {
+ // TODO This should be solved differently, by setting the macro arguments to zero
+ definitions.push_back({ safe_index_func_name, safe_index_func_val });
+ definitions.push_back({ index_func_name, index_func_val });
} else {
definitions.push_back({ safe_index_func_name, "f" });
definitions.push_back({ index_func_name, "f" });
res = '{{"{}",\n(std::string) R"__krnl(\n'.format(kernel_name)
content = self.append_file_content(filename, filename)
max_lines = 200
+ max_characters = 16350
+ characters = 1 # Newline character above
for i, line in enumerate(content.split('\n')):
- if i % max_lines == 0:
+ if (i + 1) % max_lines == 0 or characters + len(line) + 1 > max_characters:
res += ')__krnl"\n + R"__krnl('
+ characters = 0
res += line + '\n'
+ characters += len(line) + 1
- res += ')__krnl"}},\n\n'.format(kernel_name, self.append_file_content(filename, filename))
+ res += ')__krnl"}},\n\n'.format(kernel_name)
return res
for (auto& c : conf) {
std::string fused_ops;
std::string fused_ops_preload;
- std::string fused_ops_calc_only;
+ std::string fused_ops_calc;
std::string in_name = c.input_var_name;
Datatype in_type = c.input_dt;
-
- bool can_use_preload = true;
+ bool can_all_use_preload = true;
for (size_t i = 0; i < params.fused_ops.size(); i++) {
auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
in_name = out_var;
in_type = out_type;
- can_use_preload &= fused_dep_codegen.CanPreloadData(c);
+ bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
+ can_all_use_preload &= can_use_preload;
fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
- fused_ops_preload += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
- fused_ops_calc_only += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
+ if (can_use_preload)
+ fused_ops_preload += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
+ if (c.allow_for_partial_preload && !can_use_preload)
+ fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
+ fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
}
jit.AddConstant(MakeJitConstant("FUSED_OPS" + c.suffix, fused_ops));
jit.AddConstant(MakeJitConstant("FUSED_OPS_PRELOAD" + c.suffix, fused_ops_preload));
- jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc_only));
+ jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc));
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, in_name));
- jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix, can_use_preload));
+ bool can_any_use_preload = !fused_ops_preload.empty();
+ jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix,
+ can_all_use_preload || (c.allow_for_partial_preload && can_any_use_preload)));
}
jit.Merge(MakeFusedOpsDeclsJitConstants(params, conf));
case kernel_selector::DataLayout::bs_fs_yx_bsv16_fsv16: return "BS_FS_YX_BSV16_FSV16";
case kernel_selector::DataLayout::bs_fs_zyx_bsv16_fsv16: return "BS_FS_ZYX_BSV16_FSV16";
case kernel_selector::DataLayout::nv12: return "NV12";
+ case kernel_selector::DataLayout::image_2d_rgba: return "IMAGE_2D_RGBA";
default:
return "";
}
}
std::string toString(WeightsLayout layout) {
- switch (layout) {
+ switch (layout) {
case WeightsLayout::oi: return "OI";
case WeightsLayout::io: return "IO";
case WeightsLayout::oiyx: return "OIYX";
case WeightsLayout::iyxo: return "IYXO";
case WeightsLayout::yxio: return "YXIO";
case WeightsLayout::os_is_yx_isv16_osv16: return "OS_IS_YX_ISV16_OSV16";
+ case WeightsLayout::os_is_yx_osv16_isv16: return "OS_IS_YX_OSV16_ISV16";
case WeightsLayout::os_iyx_osv16: return "OS_IYX_OSV16";
case WeightsLayout::os_iyx_osv32: return "OS_IYX_OSV32";
case WeightsLayout::os_iyx_osv32__ai32: return "OS_IYX_OSV32__AI32";
case WeightsLayout::giy_xs_os_xsv2_osv16__ao32: return "GIY_XS_OS_XSV2_OSV16__AO32";
case WeightsLayout::giy_xs_os_xsv2_osv8__ao32: return "GIY_XS_OS_XSV2_OSV8__AO32";
case WeightsLayout::gs_oi_yxs_gsv4_yxsv4: return "GS_OI_YXS_GSV4_YXSV4";
+ case WeightsLayout::gs_oi_yxs_gsv16_yxsv4: return "GS_OI_YXS_GSV16_YXSV4";
+ case WeightsLayout::gs_oi_yxs_gsv32_yxsv4: return "GS_OI_YXS_GSV32_YXSV4";
case WeightsLayout::g_os_is_yx_isv16_osv16: return "G_OS_IS_YX_ISV16_OSV16";
+ case WeightsLayout::g_os_is_yx_osv16_isv4: return "G_OS_IS_YX_OSV16_ISV4";
default: throw std::invalid_argument("Failed to convert WeightsLayout " + std::to_string(layout) + " to string");
}
}
}
void ParamsKey::EnableFusedConvEltwiseRWOutOpt() { key.restrict.val.dedicated.fused_conv_eltw.rw_out_opt = 1; }
+void ParamsKey::EnableFusedConvEltwDepthToSpaceFusing() { key.restrict.val.dedicated.fused_conv_eltw.depth_to_space_fused = 1; }
void ParamsKey::EnableQuantization(QuantizationType q) {
k.EnableSubGroupShort();
}
+ if (engineInfo.bSubGroupCharSupport) {
+ k.EnableSubGroupChar();
+ }
+
return k;
}
uint32_t stride : 1;
// fused conv eltw
uint32_t rw_out_opt : 1;
+ uint32_t depth_to_space_fused : 1;
} fused_conv_eltw;
struct quantize_t {
uint32_t packed_binary_output : 1;
struct val_t {
uint32_t subgroup : 1;
uint32_t subgroupShort : 1;
+ uint32_t subgroupChar : 1;
} val;
uint32_t raw;
} machineInfo;
void EnableGradient() { key.restrict.val.gradient = 1; }
void EnableSubGroup() { key.machineInfo.val.subgroup = 1; }
void EnableSubGroupShort() { key.machineInfo.val.subgroupShort = 1; }
+ void EnableSubGroupChar() { key.machineInfo.val.subgroupChar = 1; }
void EnableNonBiasTerm() { key.restrict.val.nonBias = 1; }
void EnableBiasPerFeature() { key.restrict.val.biasPerFeatureMap = 1; }
void EnableBiasPerOutput() { key.restrict.val.biasPerOutput = 1; }
void EnableFusedConvEltwInt8Quantization() { key.restrict.val.dedicated.fused_conv_eltw.quantization = 1; }
void EnableFusedConvEltwOutputCalibration() { key.restrict.val.dedicated.fused_conv_eltw.calibration = 1; }
void EnableFusedConvEltwEltwiseStride();
+ void EnableFusedConvEltwDepthToSpaceFusing();
void EnableQuantizePackedBinaryOutput() { key.restrict.val.dedicated.quantize.packed_binary_output = 1; }
void EnableQuantizeScaleShiftOpt() { key.restrict.val.dedicated.quantize.scale_shift_opt = 1; }
struct EngineInfo {
bool bSubGroupSupport = false;
bool bSubGroupShortSupport = false;
+ bool bSubGroupCharSupport = false;
bool bFP16Support = false;
bool bFP64Support = false;
bool bImageSupport = false;
IndexType index_type;
// Defines outer loops channels where fused op is called.
std::vector<Tensor::DataChannelName> loop_axes;
+ // If allow_for_partial_preload is false, then all fused_ops are required to support preloading.
+ // If allow_for_partial_preload is true, then fused_ops that cannot be preloaded are loaded in FUSED_OPS_CALC instead.
+ bool allow_for_partial_preload;
FusedOpsConfiguration(std::string suffix,
std::vector<std::string> bfzyx_idx_order,
BoundaryCheck boundary_check = BoundaryCheck::ENABLED,
IndexType index_type = IndexType::TENSOR_COORD,
Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT,
- std::vector<Tensor::DataChannelName> loop_axes = {})
+ std::vector<Tensor::DataChannelName> loop_axes = {},
+ bool allow_for_partial_preload = false)
: suffix(suffix)
, bfzyx_idx_order(bfzyx_idx_order)
, input_var_name(input_var_name)
, load_type(load_type)
, boundary_check(boundary_check)
, index_type(index_type)
- , loop_axes(loop_axes) { }
+ , loop_axes(loop_axes)
+ , allow_for_partial_preload(allow_for_partial_preload) { }
FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; }
FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; }
FusedOpsConfiguration& SetBoundaryCheck(BoundaryCheck val) { boundary_check = val; return *this; }
FusedOpsConfiguration& SetIndexType(IndexType val) { index_type = val; return *this; }
FusedOpsConfiguration& SetVectorAxis(Tensor::DataChannelName val) { vec_axis = val; return *this; }
- FusedOpsConfiguration& SetLoopAxes(std::vector<Tensor::DataChannelName> val) { loop_axes = std::move(val); return *this; }
+ FusedOpsConfiguration& SetLoopAxes(std::vector<Tensor::DataChannelName> val, bool partial_preload = false) {
+ loop_axes = std::move(val);
+ allow_for_partial_preload = partial_preload;
+ return *this; }
};
// Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program_impl::fuse_nodes
namespace cldnn {
-void err_details::cldnn_print_error_message(const std::string& file,
- int line,
+void err_details::cldnn_print_error_message(
+#ifndef NDEBUG
+ const std::string& file, int line,
+#else
+ const std::string&, int,
+#endif
const std::string& instance_id,
std::stringstream& msg,
const std::string& add_msg) {
#ifndef NDEBUG
source_of_error << file << " at line: " << line << std::endl;
-#else
- (void)file;
- (void)line;
#endif
source_of_error << "Error has occured for: " << instance_id << std::endl;
"expected size of batch",
1,
"Biases isn't 1D vector.");
- CLDNN_ERROR_NOT_EQUAL(node.id(),
- "Bias feature[0]",
- bias_inst.size.feature[0],
- "expected feature map number",
- output_size.feature[0] / split,
- "Bias/fm mismatch");
+
+ if (node.get_output_layout().format != format::image_2d_rgba) {
+ CLDNN_ERROR_NOT_EQUAL(node.id(),
+ "Bias feature[0]",
+ bias_inst.size.feature[0],
+ "expected feature map number",
+ output_size.feature[0] / split,
+ "Bias/fm mismatch");
+ }
CLDNN_ERROR_NOT_EQUAL(node.id(),
"Bias spatial[1]",
bias_inst.size.spatial[1],
// block f16 format
{std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), concatenation_gpu::create},
+ {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create},
+ {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create},
// MMAD
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create},
// block f16 format
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
+ // block i8 format
+ implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
// MMAD
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw);
+ implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
+ implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw);
// block f16
{ std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), eltwise_gpu::create },
+ { std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), eltwise_gpu::create },
+ { std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), eltwise_gpu::create },
// 3D
{ std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), eltwise_gpu::create },
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw},
{std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), val_fw},
+ {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw},
+ {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw},
// fs_b_yx_fsv32
{std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw},
});
const auto transposed = arg.get_transposed();
- assert(arg.get_output_layout().size.feature[0] == weights_layout.size.batch[0] * weights_layout.size.group[0]);
+ if (arg.get_fused_primitives().empty() || !(arg.get_fused_primitives().begin()->node->is_type<depth_to_space>()))
+ assert(arg.get_output_layout().size.feature[0] == weights_layout.size.batch[0] * weights_layout.size.group[0]);
// conv params
auto fused_params =
fused_params.non_conv_scale = primitive->non_conv_scale;
fused_params.second_input_in_output = primitive->second_input_in_output;
+ fused_params.depth_to_space_already_fused = primitive->depth_to_space_already_fused;
conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1;
conv_params.split = split;
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32),
fused_conv_eltwise_gpu::create);
+ implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba),
+ fused_conv_eltwise_gpu::create);
}
} // namespace detail
_height = layout.size.spatial[0] * layout.size.feature[0] * layout.size.spatial[1];
order = CL_RGBA;
break;
+ case format::image_2d_rgba:
+ _width = layout.size.spatial[0];
+ _height = layout.size.spatial[1];
+ order = CL_RGBA;
+ if (layout.size.feature[0] != 3 && layout.size.feature[0] != 4) {
+ CLDNN_ERROR_MESSAGE("2D image allocation", "invalid number of channels in image_2d_rgba input image (should be 3 or 4)!");
+ }
+ type = CL_UNORM_INT8;
+ break;
case format::nv12:
_width = layout.size.spatial[1];
_height = layout.size.spatial[0];
const shared_mem_params* params,
uint32_t net_id)
: gpu_image2d(engine, new_layout,
- cl::ImageVA(engine->get_context()->context(), CL_MEM_READ_ONLY,
+ cl::ImageVA(engine->get_context()->context(), CL_MEM_READ_WRITE,
params->surface, params->plane),
net_id),
device(params->user_device),
mvn_gpu::create);
implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
mvn_gpu::create);
+ implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16),
+ mvn_gpu::create);
+ implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16),
+ mvn_gpu::create);
+ implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16),
+ mvn_gpu::create);
+ implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16),
+ mvn_gpu::create);
}
} // namespace detail
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), pooling_gpu::create);
- // block fsv16 format
+ // block fp16 format
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), pooling_gpu::create);
+ // block i8 format
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), pooling_gpu::create);
// 3D
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
params.shrink_axis_mask = arg.get_primitive()->shrink_axis_mask;
pad_vector_to_size(params.shrink_axis_mask, dims_num, 0);
- std::vector<size_t> logical_dims = params.output.LogicalDims();
+ std::vector<size_t> logical_dims = params.inputs[0].LogicalDims();
+ std::reverse(logical_dims.begin(), logical_dims.end()); // get dims in bfyx order
std::vector<int32_t> out_shape;
for (const auto& dim : logical_dims)
out_shape.push_back(static_cast<int32_t>(dim));
// instead.
vector_assign_if_not_mask(params.striding_params[1], out_shape, params.end_mask);
+ for (size_t dim = 0; dim < params.striding_params[2].size(); dim++) {
+ auto begin = params.striding_params[0][dim] < 0 ? out_shape[dim] + params.striding_params[0][dim] : params.striding_params[0][dim];
+ auto end = params.striding_params[1][dim] < 0 ? out_shape[dim] + params.striding_params[1][dim] : params.striding_params[1][dim];
+ auto stride = params.striding_params[2][dim];
+ if (stride < 0 && (end > begin)) {
+ std::swap(params.striding_params[0][dim], params.striding_params[1][dim]);
+ params.striding_params[0][dim] = params.striding_params[0][dim] - 1;
+ }
+ }
+
auto& kernel_selector = kernel_selector::strided_slice_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(params, op_params);
#include "lstm_inst.h"
#include "reshape_inst.h"
#include "resample_inst.h"
+#include "permute_inst.h"
+#include "depth_to_space_inst.h"
#include "lstm_dynamic_inst.h"
#include "lstm_dynamic_input_inst.h"
#include "lstm_dynamic_timeloop_inst.h"
#include "mutable_data_inst.h"
#include "arg_max_min_inst.h"
+#include "kernel_selector_utils.h"
#include <iomanip>
#include <string>
p.nodes_map.erase(node->id());
continue;
}
+
+ // Find the sequence reshape->permute->reshape and replace it with depth_to_space
+ if (node->is_type<reshape>()) {
+ if (!p.get_options().get<build_option_type::optimize_data>()->enabled())
+ continue;
+
+ if (node->get_users().size() == 0)
+ continue;
+
+ auto& input_node = node->get_dependency(0);
+ if (!(node->get_users().front()->is_type<permute>()) || !(input_node.is_type<reorder>()))
+ continue;
+
+ auto input_node_layout = input_node.get_output_layout();
+ if (input_node_layout.format != format::bfwzyx || input_node_layout.data_type != data_types::f16)
+ continue;
+
+ // optimal implementation only for depth to space block size 2
+ auto reshape1_layout = node->get_output_layout();
+ if (reshape1_layout.size.spatial[3] != 2)
+ continue;
+
+ auto permute_prim = node->get_users().front()->as<permute>().typed_desc();
+ primitive_id permute_id = node->get_users().front()->id();
+ auto& permute_node = node->get_users().front();
+
+ auto reshape1_prim = node->as<reshape>().typed_desc();
+ primitive_id reshape1_id = node->id();
+
+ p.remove_connection(*node, *permute_node);
+
+ auto perm_node_ptr = p.nodes_map.find(permute_id)->second;
+ auto perm_node = &perm_node_ptr->as<permute>();
+
+ auto rename_id = permute_id + "_tmp";
+ p.rename(*perm_node, rename_id);
+
+ auto reorder_id = input_node.id() + "_reorder_for_depth_to_space";
+ auto reorder_prim = std::make_shared<reorder>(reorder_id, input_node.id(), format::bfyx, input_node_layout.data_type);
+ auto pixel_shuffle_prim = std::make_shared<depth_to_space>(permute_id, reorder_id, 2);
+
+ p.get_or_create(reorder_prim);
+ p.get_or_create(pixel_shuffle_prim);
+ auto reorder_depth_node_ptr = p.nodes_map.find(reorder_id)->second;
+ auto pixel_shuffle_node_ptr = p.nodes_map.find(permute_id)->second;
+ p.add_connection(input_node, *reorder_depth_node_ptr);
+ p.add_connection(*reorder_depth_node_ptr, *pixel_shuffle_node_ptr);
+
+ auto deconv_node_ptr = p.nodes_map.find(rename_id)->second;
+ p.replace_all_usages(*deconv_node_ptr, *pixel_shuffle_node_ptr);
+ p.optimized_out.push_back(rename_id);
+ p.nodes_map.erase(rename_id);
+
+ p.remove_connection(input_node, *node);
+ p.replace_all_usages(*node, input_node);
+ p.optimized_out.push_back(reshape1_id);
+ p.nodes_map.erase(reshape1_id);
+ continue;
+ }
}
}
#include "convolution_inst.h"
#include "deconvolution_inst.h"
+#include "depth_to_space_inst.h"
+#include "kernel_selector_utils.h"
#include <vector>
#include <list>
#include <memory>
continue;
auto deconv_prim = node->as<deconvolution>().typed_desc();
-
- // limit optimization to stride = 1
- if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient())
- continue;
-
- primitive_id deconv_id = node->id();
- auto& input_node = node->get_dependency(0);
-
- // disable for 5D
- if (cldnn::format::dimension(input_node.get_output_layout().format) == 5)
- continue;
-
- // Disable for blocked formats
- if ((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
- _lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16)) {
- continue;
- }
-
-
- primitive_id input_id = deconv_prim->input[0];
-
- // setting convolution parameters based on deconvolution params
- auto stride = deconv_prim->stride;
+ tensor filter_size = { 1, 1, 1, 1, 1 };
auto weights = deconv_prim->weights;
+
std::vector<primitive_id> weights_vec;
- for (auto& weights_id : weights) weights_vec.push_back(weights_id);
- auto biases = deconv_prim->bias;
- std::vector<primitive_id> bias_vec;
- for (auto& bias_id : biases) bias_vec.push_back(bias_id);
- auto input_offset = deconv_prim->input_offset;
- auto output_padding = deconv_prim->output_padding;
-
- // remove deconvolution node and its connections to weights and biases, rename it and move to the optimized
- // list
- tensor filter_size = {1, 1, 1, 1, 1};
- p.remove_connection(node->get_dependency(0), *node);
+ for (auto& weights_id : weights)
+ weights_vec.push_back(weights_id);
+
for (auto& weights_id : weights_vec) {
auto weights_iter = p.nodes_map.find(weights_id);
if (weights_iter == p.nodes_map.end()) continue;
auto weights_node_ptr = weights_iter->second;
- p.remove_connection(*weights_node_ptr, *node);
- // get filter spatial sizes for input offset adjustment, perform this only once as all filters shouls
+ // get filter spatial sizes for input offset adjustment, perform this only once as all filters should
// have same size
if (weights_id == weights_vec[0])
filter_size = weights_node_ptr->get_output_layout().size;
}
- input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
- input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
- input_offset.spatial[2] = std::abs(input_offset.spatial[2]) - (filter_size.spatial[2] - 1);
+ // limit optimization to stride = 1
+ if (deconv_prim->stride.spatial[0] == 1 && deconv_prim->stride.spatial[1] == 1 && !deconv_prim->gradient()) {
+ primitive_id deconv_id = node->id();
+ auto& input_node = node->get_dependency(0);
- if (!bias_vec.empty()) {
- for (auto& bias_id : bias_vec) {
- auto bias_iter = p.nodes_map.find(bias_id);
- if (bias_iter == p.nodes_map.end()) continue;
+ // disable for 5D
+ if (cldnn::format::dimension(input_node.get_output_layout().format) == 5)
+ continue;
- auto bias_id_node_ptr = bias_iter->second;
- p.remove_connection(*bias_id_node_ptr, *node);
+ // Disable for blocked formats
+ if ((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
+ _lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16)) {
+ continue;
}
- }
- auto rename_id = deconv_id + "_tmp";
- p.rename(*node, rename_id);
-
- // create convolution primitive
- if (biases.size() != 0) {
- auto conv_prim = std::make_shared<convolution>(deconv_id,
- input_id,
- weights_vec,
- bias_vec,
- stride,
- input_offset,
- tensor{1, 1, 1, 1},
- output_padding);
- p.get_or_create(conv_prim);
- } else {
- auto conv_prim = std::make_shared<convolution>(deconv_id,
- input_id,
- weights_vec,
- stride,
- input_offset,
- tensor{1, 1, 1, 1},
- output_padding);
- p.get_or_create(conv_prim);
- }
- auto conv_node_itr = p.nodes_map.find(deconv_id);
- if (conv_node_itr == p.nodes_map.end()) continue;
+ primitive_id input_id = deconv_prim->input[0];
- auto conv_node_ptr = conv_node_itr->second;
- auto conv_node = &conv_node_ptr->as<convolution>();
- conv_node->set_transposed(true);
+ // setting convolution parameters based on deconvolution params
+ auto stride = deconv_prim->stride;
+ auto biases = deconv_prim->bias;
+ std::vector<primitive_id> bias_vec;
+ for (auto& bias_id : biases) bias_vec.push_back(bias_id);
+ auto input_offset = deconv_prim->input_offset;
+ auto output_padding = deconv_prim->output_padding;
- // add connections input->convolution, weights->convolution and bias->convolution
- p.add_connection(input_node, *conv_node_ptr);
+ // remove deconvolution node and its connections to weights and biases, rename it and move to the optimized
+ // list
+ p.remove_connection(node->get_dependency(0), *node);
+ for (auto& weights_id : weights_vec) {
+ auto weights_iter = p.nodes_map.find(weights_id);
+ if (weights_iter == p.nodes_map.end()) continue;
- for (auto& weights_id : weights_vec) {
- auto weights_node_itr = p.nodes_map.find(weights_id);
- if (weights_node_itr == p.nodes_map.end()) continue;
+ auto weights_node_ptr = weights_iter->second;
+ p.remove_connection(*weights_node_ptr, *node);
+ }
- auto weights_node_ptr = weights_node_itr->second;
- p.add_connection(*weights_node_ptr, *conv_node_ptr);
- }
+ input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
+ input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
+ input_offset.spatial[2] = std::abs(input_offset.spatial[2]) - (filter_size.spatial[2] - 1);
- if (!bias_vec.empty()) {
- for (auto& bias_id : bias_vec) {
- auto bias_id_node_itr = p.nodes_map.find(bias_id);
- if (bias_id_node_itr == p.nodes_map.end()) continue;
+ if (!bias_vec.empty()) {
+ for (auto& bias_id : bias_vec) {
+ auto bias_iter = p.nodes_map.find(bias_id);
+ if (bias_iter == p.nodes_map.end()) continue;
- auto bias_id_node_ptr = bias_id_node_itr->second;
- p.add_connection(*bias_id_node_ptr, *conv_node_ptr);
+ auto bias_id_node_ptr = bias_iter->second;
+ p.remove_connection(*bias_id_node_ptr, *node);
+ }
}
- }
+ auto rename_id = deconv_id + "_tmp";
+ p.rename(*node, rename_id);
- auto deconv_node_itr = p.nodes_map.find(rename_id);
- if (deconv_node_itr != p.nodes_map.end()) {
- auto deconv_node_ptr = deconv_node_itr->second;
- p.replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
- p.optimized_out.push_back(rename_id);
- p.nodes_map.erase(rename_id);
- }
+ // create convolution primitive
+ if (biases.size() != 0) {
+ auto conv_prim = std::make_shared<convolution>(deconv_id,
+ input_id,
+ weights_vec,
+ bias_vec,
+ stride,
+ input_offset,
+ tensor{ 1, 1, 1, 1 },
+ output_padding);
+ p.get_or_create(conv_prim);
+ } else {
+ auto conv_prim = std::make_shared<convolution>(deconv_id,
+ input_id,
+ weights_vec,
+ stride,
+ input_offset,
+ tensor{ 1, 1, 1, 1 },
+ output_padding);
+ p.get_or_create(conv_prim);
+ }
+
+ auto conv_node_itr = p.nodes_map.find(deconv_id);
+ if (conv_node_itr == p.nodes_map.end()) continue;
+
+ auto conv_node_ptr = conv_node_itr->second;
+ auto conv_node = &conv_node_ptr->as<convolution>();
+ conv_node->set_transposed(true);
+
+ // add connections input->convolution, weights->convolution and bias->convolution
+ p.add_connection(input_node, *conv_node_ptr);
+
+ for (auto& weights_id : weights_vec) {
+ auto weights_node_itr = p.nodes_map.find(weights_id);
+ if (weights_node_itr == p.nodes_map.end()) continue;
+
+ auto weights_node_ptr = weights_node_itr->second;
+ p.add_connection(*weights_node_ptr, *conv_node_ptr);
+ }
+
+ if (!bias_vec.empty()) {
+ for (auto& bias_id : bias_vec) {
+ auto bias_id_node_itr = p.nodes_map.find(bias_id);
+ if (bias_id_node_itr == p.nodes_map.end()) continue;
+
+ auto bias_id_node_ptr = bias_id_node_itr->second;
+ p.add_connection(*bias_id_node_ptr, *conv_node_ptr);
+ }
+ }
+
+ auto deconv_node_itr = p.nodes_map.find(rename_id);
+ if (deconv_node_itr != p.nodes_map.end()) {
+ auto deconv_node_ptr = deconv_node_itr->second;
+ p.replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
+ p.optimized_out.push_back(rename_id);
+ p.nodes_map.erase(rename_id);
+ }
- p.mark_if_data_flow(*conv_node);
- conv_node->recalc_output_layout(true);
+ update_processing_order = true;
- update_processing_order = true;
+
+ p.mark_if_data_flow(*conv_node);
+ conv_node->recalc_output_layout(true);
+
+ update_processing_order = true;
+ // current optimization only available for specific deconvolution parameters
+ } else if (node->is_output() == false &&
+ node->get_output_layout().size.feature[0] == 1 &&
+ deconv_prim->stride.spatial[0] == 2 && deconv_prim->stride.spatial[1] == 2 &&
+ filter_size.spatial[0] == 9 && filter_size.spatial[1] == 9 &&
+ deconv_prim->input_offset.spatial[0] == -4 && deconv_prim->input_offset.spatial[1] == -4 &&
+ weights_vec.size() == 1 && deconv_prim->bias.size() == 1 &&
+ node->get_dependency(0).get_output_layout().format == format::bfyx &&
+ !deconv_prim->gradient()) {
+ primitive_id deconv_id = node->id();
+ auto& input_node = node->get_dependency(0);
+ primitive_id input_id = deconv_prim->input[0];
+
+ auto scale_factor = deconv_prim->stride.spatial[0];
+
+ auto cur_weights_node_ptr = p.nodes_map.find(weights_vec[0])->second;
+ auto weights_layout = cur_weights_node_ptr->get_output_layout();
+ auto weights_data_type = weights_layout.data_type;
+
+ auto biases = deconv_prim->bias[0];
+ auto bias_id_node_ptr = p.nodes_map.find(biases)->second;
+ auto bias_data_type = bias_id_node_ptr->get_output_layout().data_type;
+
+ // enable only for fp32 and fp16
+ if (weights_data_type != data_types::f16 &&
+ weights_data_type != data_types::f32 &&
+ bias_data_type != data_types::f16 &&
+ bias_data_type != data_types::f32)
+ continue;
+
+ // setting convolution parameters based on deconvolution params
+ tensor stride = { 1, 1, 1, 1 };
+ tensor input_offset = { 0, 0, -scale_factor, -scale_factor };
+ auto output_padding = deconv_prim->output_padding;
+
+ // remove deconvolution node and its connections to weights and biases,
+ // rename it and move to the optimized list
+ p.remove_connection(node->get_dependency(0), *node);
+
+ auto weights_node_ptr = p.nodes_map.find(weights_vec[0])->second;
+ p.remove_connection(*weights_node_ptr, *node);
+ p.remove_connection(*bias_id_node_ptr, *node);
+
+ auto rename_id = deconv_id + "_tmp";
+ p.rename(*node, rename_id);
+
+ // reshape weights
+ int pixel_shuffle_size = scale_factor * scale_factor;
+ int kernel_size = 5;
+ tensor target_weights_size = { pixel_shuffle_size, filter_size.feature[0], kernel_size, kernel_size };
+ auto target_weights_layout = layout{ weights_layout.data_type, weights_layout.format, target_weights_size };
+
+ {
+ memory_impl::ptr data_to_allocate = p.get_engine().allocate_memory(target_weights_layout, 0);
+
+ std::vector<float> weights_vec_float;
+
+ if (weights_data_type == data_types::f16) {
+ mem_lock<half_t> src{ cur_weights_node_ptr->as<data>().get_attached_memory() };
+ for (uint32_t i = 0; i < weights_layout.size.count(); i++)
+ weights_vec_float.push_back(static_cast<float>(src.data()[i]));
+ } else {
+ mem_lock<float> src{ cur_weights_node_ptr->as<data>().get_attached_memory() };
+ for (uint32_t i = 0; i < weights_layout.size.count(); i++)
+ weights_vec_float.push_back(src.data()[i]);
+ }
+
+ std::vector<std::vector<std::vector<float> > > subpixel_weights(pixel_shuffle_size);
+
+ program_helpers::reshape_deconvolution_weights(weights_vec_float,
+ static_cast<int>(filter_size.feature[0]),
+ static_cast<int>(filter_size.spatial[0]),
+ static_cast<int>(filter_size.spatial[1]),
+ scale_factor,
+ subpixel_weights);
+
+ if (weights_data_type == data_types::f16) {
+ mem_lock<half_t> dst{ data_to_allocate };
+ program_helpers::set_weights_values<half_t>(dst.data(), subpixel_weights);
+ } else if (weights_data_type == data_types::f32) {
+ mem_lock<float> dst{ data_to_allocate };
+ program_helpers::set_weights_values<float>(dst.data(), subpixel_weights);
+ } else {
+ throw std::logic_error("Not supported data type.");
+ }
+
+ memory api_memory = memory(data_to_allocate.detach());
+ auto data_node_weights_replace = std::make_shared<data>(weights_vec[0] + "_conv_rpl", api_memory);
+ p.get_or_create(data_node_weights_replace);
+ auto data_node_weights_replace_node_ptr = p.nodes_map.find(weights_vec[0] + "_conv_rpl")->second;
+ auto& data_node = data_node_weights_replace_node_ptr->as<data>();
+ data_node.set_output_layout(target_weights_layout, false);
+ }
+ float bias = 0;
+
+ if (bias_data_type == data_types::f16) {
+ mem_lock<half_t> src{ bias_id_node_ptr->as<data>().get_attached_memory() };
+ bias = static_cast<float>(src.data()[0]);
+ } else {
+ mem_lock<float> src{ bias_id_node_ptr->as<data>().get_attached_memory() };
+ bias = src.data()[0];
+ }
+
+ auto deconv_id_conv = deconv_id + "_conv";
+
+ // create convolution primitive
+ auto conv_prim = std::make_shared<convolution>(deconv_id_conv,
+ input_id,
+ std::vector<primitive_id>{ weights_vec[0] + "_conv_rpl" },
+ stride,
+ input_offset,
+ tensor{ 1, 1, 1, 1 },
+ output_padding);
+ p.get_or_create(conv_prim);
+
+ auto conv_node_itr = p.nodes_map.find(deconv_id_conv);
+ if (conv_node_itr == p.nodes_map.end()) continue;
+
+ auto conv_node_ptr = conv_node_itr->second;
+ auto conv_node = &conv_node_ptr->as<convolution>();
+
+ // add connections input->convolution, weights->convolution and bias->convolution
+ p.add_connection(input_node, *conv_node_ptr);
+
+ {
+ auto weights_node_conv_rpl_ptr = p.nodes_map.find(weights_vec[0] + "_conv_rpl")->second;
+ p.add_connection(*weights_node_conv_rpl_ptr, *conv_node_ptr);
+ p.inputs.push_back(weights_node_conv_rpl_ptr.get());
+ }
+
+ auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_id, deconv_id_conv, 2);
+
+ p.get_or_create(pixel_shuffle_prim);
+ auto pixel_shuffle_node_ptr = p.nodes_map.find(deconv_id)->second;
+ pixel_shuffle_node_ptr->add_fused_activation(activation_func::linear, { 1, bias });
+
+ // add connections input->convolution, weights->convolution
+ p.add_connection(*conv_node_ptr, *pixel_shuffle_node_ptr);
+
+ auto deconv_node_ptr = p.nodes_map.find(rename_id);
+ if (deconv_node_ptr != p.nodes_map.end()) {
+ p.replace_all_usages(*deconv_node_ptr->second, *pixel_shuffle_node_ptr);
+ p.optimized_out.push_back(rename_id);
+ p.nodes_map.erase(rename_id);
+ }
+ p.mark_if_data_flow(*conv_node);
+ conv_node->recalc_output_layout(true);
+
+ update_processing_order = true;
+ }
}
}
/*
-// Copyright (c) 2018-2019 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "api/eltwise.hpp"
#include "api/pooling.hpp"
+#include "fused_conv_eltwise_inst.h"
#include "primitive_inst.h"
#include "activation_inst.h"
#include "concatenation_inst.h"
#include "eltwise_inst.h"
#include "reshape_inst.h"
#include "scale_inst.h"
+#include "depth_to_space_inst.h"
#include "pass_manager.h"
#include "program_helpers.h"
lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis];
}
+ // check if it is worth doing concat in place, in case the following primitive is convolution
+ // with different input padding than concatenation's input users' convolutions,
+ // it is likely that convolution's implementation will be a reference one, due to mismatched padding
+ // and performance gain by doing in place concat is nullified by slower convolution implementation
+ // this should be handled by more advanced tuning mechanism on the topology level
+ auto& users = node.get_users();
+ if (users.size() == 1) {
+ auto& user = users.front();
+ if (node.get_output_layout().format == format::bfyx && user->type() == convolution::type_id()) {
+ auto out_input_offsets = user->as<convolution>().get_primitive()->input_offset;
+
+ std::vector<tensor> in_input_offsets;
+ for (auto& in_user : nodes_list.first) {
+ if (in_user->type() == convolution::type_id())
+ in_input_offsets.push_back(in_user->as<convolution>().get_primitive()->input_offset);
+ }
+
+ for (auto& in_input_offset : in_input_offsets) {
+ if (in_input_offset.spatial[0] != out_input_offsets.spatial[0] &&
+ in_input_offset.spatial[1] != out_input_offsets.spatial[1])
+ return;
+ }
+ } else if (user->type() == fused_conv_eltwise::type_id()) {
+ if (!user->as<fused_conv_eltwise>().get_fused_primitives().empty() &&
+ user->as<fused_conv_eltwise>().get_fused_primitives().begin()->node->is_type<depth_to_space>())
+ return;
+ }
+ }
+
// apply concatenation in place optimization
for (auto input : nodes_list.first) {
auto input_lenght = input->get_output_layout().size.raw[concat_axis];
int p1_pnum = p.get_processing_order().get_processing_number(parents[fused_idx]);
int p2_pnum = p.get_processing_order().get_processing_number(parents[peer_idx]);
- if (p1_pnum < p2_pnum && can_fuse_parents[peer_idx]) {
+ auto p1_dt = parents[fused_idx]->get_output_layout().data_type;
+ auto p2_dt = parents[peer_idx]->get_output_layout().data_type;
+
+ if (can_fuse_parents[peer_idx] &&
+ ((p1_pnum < p2_pnum && p1_dt == p2_dt) || (data_type_traits::is_floating_point(p2_dt) && !data_type_traits::is_floating_point(p1_dt)))) {
+ // Swap in 2 cases:
+ // 1. Both branches have same data type. Select branch with lower processing number
+ // 2. Peer node has fp32 output type, but fused node - int8. In that case we have to fuse to the branch
+ // with fp32 out type to avoid fp32 blobs in the quantized graph.
std::swap(fused_idx, peer_idx);
}
auto fused_node = parents[fused_idx];
auto peer_node = parents[peer_idx];
+
if (parent1->is_type<convolution>() && !conv_supports_fusings(parent1->as<convolution>()))
return;
p.get_processing_order().calc_processing_order(p);
}
+void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program_node* node) {
+ // make sure this convolution have only 1 user and it's depth_to_space
+ // make sure convolution is not an output
+ if (node->get_users().size() != 1 || node->is_output())
+ return;
+
+ if (!node->get_users().front()->is_type<depth_to_space>())
+ return;
+
+ convolution_node* conv_node = static_cast<convolution_node*>(node);
+
+ depth_to_space_node* d_t_s_node = static_cast<depth_to_space_node*>(node->users.front());
+ if (d_t_s_node->get_users().empty())
+ return;
+ if (!d_t_s_node->get_users().front()->is_type<eltwise>())
+ return;
+
+ for (auto& dep : d_t_s_node->get_dependencies()) {
+ format fmt = dep->get_output_layout().format;
+ data_types dep_dt = dep->get_output_layout().data_type;
+ if ((fmt != format::bfyx || dep_dt != data_types::f16))
+ return;
+ }
+
+ p.fuse_nodes(*conv_node, *d_t_s_node);
+}
+
void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* node) {
// make sure this convolution have only 1 user and it's eltwise
// make sure convolution is not an output
convolution_node* conv_node = static_cast<convolution_node*>(node);
convolution& conv = const_cast<convolution&>(*conv_node->get_primitive());
+ bool if_already_depth_to_space_fused = false;
+ if (!conv_node->get_fused_primitives().empty())
+ if_already_depth_to_space_fused = conv_node->get_fused_primitives().begin()->node->is_type<depth_to_space>();
+
// TODO: find a better way to check for available kernels
// currently works only for these formats
data_types data_type = conv_node->get_output_layout().data_type;
(fmt != format::byxf_af32 || dep_dt != data_types::i8) &&
(fmt != format::byxf_af32 || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) &&
- (fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16))
+ (fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) &&
+ (fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused))
return;
}
if (filter_size.spatial[0] == 1 && filter_size.spatial[1] == 1) {
if (conv.stride.spatial[0] != 1 || conv.stride.spatial[1] != 1)
return;
- } else {
+ } else if (!if_already_depth_to_space_fused) {
return;
}
}
// make sure eltwise have only 2 inputs
// make sure eltwise is not an output
- if (eltw_node->inputs_count() != 2 || eltw_node->is_output())
+ if (!if_already_depth_to_space_fused && (eltw_node->inputs_count() != 2 || eltw_node->is_output()))
return;
// only single ADD operation is currently supported
if (eltw_node->input(eltw_fused_input_idx).id() != conv.id)
return;
+ auto fused_output_layout_size = eltw_node->input(eltw_second_input_idx).get_output_layout().size;
+ auto conv_output_layout_size = conv_node->get_output_layout().size;
+
+ if (fused_output_layout_size.spatial[0] * fused_output_layout_size.spatial[1] * fused_output_layout_size.feature[0] * fused_output_layout_size.batch[0]
+ != conv_output_layout_size.spatial[0] * conv_output_layout_size.spatial[1] * conv_output_layout_size.feature[0] * conv_output_layout_size.batch[0])
+ return;
+
// get strides for other than our conv input
std::vector<tensor> new_eltw_strides;
// conv strides modified by eltwise stride
// Copy output data type from eltwise
fused_conv_eltw->output_data_type = eltw_node->get_output_layout().data_type;
+ fused_conv_eltw->depth_to_space_already_fused = if_already_depth_to_space_fused;
+
auto& new_node = p.get_or_create(fused_conv_eltw);
for (size_t i = 0; i < eltw_node->get_fused_activations_funcs().size(); i++)
new_node.dependencies = updated_deps;
+ if (if_already_depth_to_space_fused) {
+ new_node.add_fused_primitives(conv_node->get_fused_primitives());
+ }
+
// Extract convolution node - will replace its usage in fused with input
p.extract_and_remove(*conv_node);
- new_node.recalc_output_layout();
+
+ // To change convolution's output to image type, make sure that it is the last primitive in the topology,
+ // or only reorder is afterwards and it is network's output
+ auto reorder_user = (new_node.get_users().size() == 1);
+ if (reorder_user)
+ reorder_user &= ((new_node.get_users().front()->is_type<reorder>()) && (new_node.get_users().front()->is_output()));
+ if (if_already_depth_to_space_fused && (new_node.get_users().size() == 0 || reorder_user)) {
+ cldnn::layout new_layout = { data_types::u8, format::image_2d_rgba, fused_output_layout_size };
+ new_node.set_output_layout(new_layout);
+ // Remove output reorder if present
+ if (reorder_user) {
+ auto& reorder_node = new_node.get_users().front();
+ reorder_node->remove_dependency(1);
+ p.extract_and_remove(*reorder_node);
+ }
+ } else {
+ new_node.recalc_output_layout();
+ }
p.add_optimized_primitive_info(conv_id, {new_node.id()});
p.add_optimized_primitive_info(eltw_id, {new_node.id()});
auto& node = (*node_itr);
+ fuse_conv_depth_to_space(p, node);
+
fuse_conv_eltwise(p, node);
}
}
auto node_itr = itr++;
auto& node = (*node_itr);
- if (node->is_output())
- continue;
-
// Detects if given eltwise node performs zero point subtraction
auto is_zero_point_node = [](eltwise_node& node) -> bool {
auto prim = node.get_primitive();
// Remove sub operations from the graph and set correct users for zero points and inputs
if (asymmetric_data) {
if (!new_a_zp || !new_input)
- CLDNN_ERROR_MESSAGE(convolution_node.id(), "Unexpected nullptr in asymmetric quantization for activations optimization");
+ CLDNN_ERROR_MESSAGE(new_conv_node.id(), "Unexpected nullptr in asymmetric quantization for activations optimization");
auto& zp_users = new_a_zp->users;
auto& in_users = new_input->users;
if (asymmetric_weights) {
if (!new_w_zp || !new_weights)
- CLDNN_ERROR_MESSAGE(convolution_node.id(), "Unexpected nullptr in asymmetric quantization for weights optimization");
+ CLDNN_ERROR_MESSAGE(new_conv_node.id(), "Unexpected nullptr in asymmetric quantization for weights optimization");
auto& zp_users = new_w_zp->users;
auto& wei_users = new_weights->users;
using namespace cldnn;
-remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations)
- : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations) {}
+remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
+ bool remove_output_reorders)
+ : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations),
+ remove_output_reorders(remove_output_reorders) {}
void remove_redundant_reorders::run(program_impl& p) {
auto update_implementation = [&](program_node& node) {
auto& r_node = node->as<reorder>();
+ bool no_output_optimization = remove_output_reorders ?
+ r_node.is_output() && (r_node.get_dependency(0).is_output() || r_node.get_dependency(0).is_type<input_layout>() ||
+ r_node.get_dependency(0).can_be_optimized()) : r_node.is_output();
+
if (r_node.has_mean() ||
!r_node.get_primitive()->subtract_per_feature.empty() ||
- r_node.is_output() ||
+ no_output_optimization ||
!r_node.get_fused_activations_funcs().empty())
continue;
// Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer,
// but pads need to be handled correctly.
- if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx &&
+ if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() &&
i_layout.size.spatial[0] == 1 && i_layout.size.spatial[1] == 1 &&
o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0) {
r_node.can_be_optimized(true);
#include "layout_optimizer.h"
#include "program_impl.h"
#include "program_helpers.h"
+#include "mvn_inst.h"
#include <vector>
#include <memory>
#include <list>
continue;
auto fmt = fmt_map.at(node);
- if (fmt == format::any)
+ if (fmt == format::any || format::is_image(fmt))
continue;
insert_reorders_in_dir<direction_e::forwards>(p, fmt_map, rf, node);
continue;
auto fmt = fmt_map.at(node);
- if (fmt == format::any)
+ if (fmt == format::any || format::is_image(fmt))
continue;
insert_reorders_in_dir<direction_e::backwards>(p, fmt_map, rf, node);
auto fmt_map = get_preferred_formats(p, lo);
propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);
+
+ // WA START ============================================================================================================
+ if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) {
+ // This is a temprorary work-around for known bad case until byxf_af32 handling will be corrected in layout_optimizer.
+ //
+ // Find pattern:
+ // mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) ->
+ // Replace with:
+ // mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) ->
+ //
+ // Generally for such convolution b_fs_yx_fsv16 will always perform better than byxf_af32,
+ // but to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions this WA is needed.
+ // Additionally reorder from af32 -> bfyx will take ~9 times longer than actual convolution.
+ for (auto& node_ptr : p.get_processing_order()) {
+ if (!node_ptr->is_in_data_flow() || !node_ptr->is_type<convolution>() || fmt_map.at(node_ptr) != format::byxf_af32)
+ continue;
+
+ auto& conv_node = node_ptr->as<convolution>();
+
+ bool input_path =
+ conv_node.input().get_output_layout().data_type == data_types::i8 &&
+ conv_node.input().is_type<mvn>() &&
+ fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16;
+ bool output_path =
+ conv_node.get_users().size() == 1 &&
+ conv_node.get_users().front()->is_type<mvn>() &&
+ fmt_map.at(conv_node.get_users().front()) == format::bfyx &&
+ conv_node.get_users().front()->get_users().size() == 1 &&
+ !conv_node.get_users().front()->as<mvn>().get_primitive()->across_channels;
+
+ if (!input_path || !output_path)
+ continue;
+
+ auto in_lay = conv_node.input().get_output_layout();
+ auto out_lay = conv_node.get_output_layout();
+ auto wei_lay = conv_node.weights().get_output_layout();
+ bool correct_layouts =
+ // weights
+ wei_lay.data_type == data_types::i8 &&
+ wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 &&
+ // input/output
+ in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 &&
+ in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 &&
+ in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 &&
+ in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720;
+
+ if (!correct_layouts)
+ continue;
+
+ bool correct_conv =
+ conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 &&
+ !conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() &&
+ !conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() &&
+ conv_node.get_primitive()->dilation == tensor(1);
+
+ if (!correct_conv)
+ continue;
+
+ fmt_map.at(node_ptr) = format::b_fs_yx_fsv16;
+ fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16;
+ }
+ }
+ // WA END ==============================================================================================================
+
insert_reorders(p, fmt_map, rf);
for (auto n : p.get_processing_order()) {
#pragma once
#include "api/depth_to_space.hpp"
#include "primitive_inst.h"
+#include "kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h"
#include <string>
+#include <memory>
namespace cldnn {
template <>
using parent::parent;
program_node& input(size_t index = 0) const { return get_dependency(index); }
+ std::shared_ptr<kernel_selector::fuse_params> get_fuse_params() const override {
+ return std::make_shared<kernel_selector::depth_to_space_fuse_params>();
+ }
};
using depth_to_space_node = typed_program_node<depth_to_space>;
private:
void run(program_impl& p) override;
void fuse_conv_eltwise(program_impl& p, program_node* node);
+ void fuse_conv_depth_to_space(program_impl& p, program_node* node);
layout_optimizer& _lo;
bool b_fs_yx_fsv16_opt;
};
class remove_redundant_reorders : public base_pass {
public:
- explicit remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing = false, bool update_implementations = false);
+ explicit remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing = false, bool update_implementations = false,
+ bool remove_output_reorders = false);
void run(program_impl& p) override;
private:
layout_optimizer& lo;
bool enable_reorder_fusing;
bool update_implementations;
+ bool remove_output_reorders;
};
class reorder_inputs : public base_pass {
const layout& target_layout,
size_t begin_offset,
size_t end_offset);
- static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
+
static std::pair<bool, bool> are_layouts_identical(layout const& l1, layout const& l2);
+
+ // helper functions for deconvolution optimizations
+ static void reshape_deconvolution_weights(const std::vector<float> &deconv_weights,
+ const int channels,
+ const int kernel_width,
+ const int kernel_height,
+ const int scale_factor,
+ std::vector<std::vector<std::vector<float> > >& subpixel_weights);
+ template <typename T>
+ static void set_weights_values(T* mem, std::vector<std::vector<std::vector<float> > > args) {
+ for (uint32_t x = 0; x < static_cast<uint32_t>(args.size()); ++x) {
+ for (uint32_t y = 0; y < static_cast<uint32_t>(args[x].size()); ++y) {
+ for (uint32_t z = 0; z < static_cast<uint32_t>(args[x][y].size()); ++z) {
+ *mem = static_cast<T>(args[x][y][z]);
+ mem++;
+ }
+ }
+ }
+ }
+ static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
};
} // namespace cldnn
return "b_fs_zyx_fsv16";
case format::bs_fs_zyx_bsv16_fsv16:
return "bs_fs_zyx_bsv16_fsv16";
+ case format::image_2d_rgba:
+ return "image_2d_rgba";
case format::oiyx:
return "oiyx";
return "image_2d_weights_winograd_6x3_s1_xfbyb";
case format::os_iyx_osv16:
return "os_iyx_osv16";
+ case format::os_is_yx_osv16_isv16:
+ return "os_is_yx_osv16_isv16";
case format::os_iyx_osv32:
return "os_iyx_osv32";
case format::os_iyx_osv64:
return "g_os_is_yx_isv8_osv16_isv2";
case format::g_os_is_zyx_isv16_osv16:
return "g_os_is_zyx_isv16_osv16";
+ case format::g_os_is_yx_osv16_isv4:
+ return "g_os_is_yx_osv16_isv4";
default:
return "unknown (" + std::to_string(fmt.value) + ")";
}
return kernel_selector::data_layout::bs_fs_yx_bsv16_fsv16;
case format::nv12:
return kernel_selector::data_layout::nv12;
+ case format::image_2d_rgba:
+ return kernel_selector::data_layout::image_2d_rgba;
default:
throw std::invalid_argument("Format f (" + std::to_string((int32_t)f.value) + ") is not a proper data layout");
}
return cldnn::format::b_fs_yx_fsv4;
case kernel_selector::data_layout::nv12:
return cldnn::format::nv12;
+ case kernel_selector::data_layout::image_2d_rgba:
+ return cldnn::format::image_2d_rgba;
default:
throw std::invalid_argument("Unable to convert data layout " + std::to_string(l) + " to tensor format");
}
return kernel_selector::weights_layout::yxio;
case format::os_iyx_osv16:
return kernel_selector::weights_layout::os_iyx_osv16;
+ case format::os_is_yx_osv16_isv16:
+ return kernel_selector::weights_layout::os_is_yx_osv16_isv16;
case format::os_iyx_osv32:
return kernel_selector::weights_layout::os_iyx_osv32;
case format::os_iyx_osv64:
return kernel_selector::weights_layout::g_os_is_yx_isv8_osv16_isv2;
case format::g_os_is_zyx_isv16_osv16:
return kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16;
+ case format::g_os_is_yx_osv16_isv4:
+ return kernel_selector::weights_layout::g_os_is_yx_osv16_isv4;
default:
throw std::invalid_argument("Unable to convert tensor layout " + fmt_to_str(f) + " to weights layout");
}
return cldnn::format::yxfb;
case kernel_selector::weights_layout::os_iyx_osv16:
return cldnn::format::os_iyx_osv16;
+ case kernel_selector::weights_layout::os_is_yx_osv16_isv16:
+ return cldnn::format::os_is_yx_osv16_isv16;
case kernel_selector::weights_layout::os_iyx_osv32:
return cldnn::format::os_iyx_osv32;
case kernel_selector::weights_layout::os_iyx_osv64:
return cldnn::format::g_os_is_yx_isv8_osv16_isv2;
case kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16:
return cldnn::format::g_os_is_zyx_isv16_osv16;
+ case kernel_selector::weights_layout::os_is_yx_osv16_isv4:
+ return cldnn::format::g_os_is_yx_osv16_isv4;
default:
return cldnn::format::bfyx;
}
kernel_selector::n_dims vec(kernel_selector::DataTensor::ChannelsCount(ks_layout));
size_t pitch = 1;
-
auto new_vals = vals;
if (ks_layout == kernel_selector::Tensor::byxf_af32) {
params.engineInfo.bSubGroupSupport = context->extension_supported("cl_intel_subgroups");
params.engineInfo.bSubGroupShortSupport = context->extension_supported("cl_intel_subgroups_short");
+ params.engineInfo.bSubGroupCharSupport = context->extension_supported("cl_intel_subgroups_char");
params.engineInfo.bFP16Support = context->extension_supported("cl_khr_fp16");
params.engineInfo.bFP64Support = context->extension_supported("cl_khr_fp64");
params.engineInfo.bIMADSupport = device_info.supports_imad != 0;
#include "eltwise_inst.h"
#include "pooling_inst.h"
#include "permute_inst.h"
+#include "quantize_inst.h"
+#include "mvn_inst.h"
#include <vector>
#include <memory>
#include <utility>
if (node.is_type<fully_connected>() && fmt == format::byxf)
return false;
+ if (node.is_type<mvn>() && fmt == format::b_fs_yx_fsv16 &&
+ node.get_dependency(0).get_output_layout().data_type != data_types::i8 &&
+ node.get_dependency(0).get_output_layout().data_type != data_types::u8)
+ return false;
+
if (node.is_type<input_layout>())
return node.get_output_layout().format == fmt;
fmt_prev == format::bfyx &&
((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
(fmt_next == format::b_fs_yx_fsv32 && prev_output_layout.size.feature[0] == 3) ||
- (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3) ||
+ (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
+ prev_output_layout.size.feature[0] == 3 &&
+ (next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8)) ||
+ (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3)))
return true;
return true;
if (prev.is_type<quantize>() &&
- (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32))
+ (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 || fmt_next == format::b_fs_yx_fsv16))
return true;
return false;
const layout &weights_layout,
std::shared_ptr<const convolution> conv,
bool weak_restrictions) {
- // A set of rules that define when b_fs_yx_fsv16 mem format can be used
+ // A set of rules that define when b_fs_yx_fsv16 mem format can be used for int8 case
+ bool i8_dt_case = (input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8) &&
+ weights_layout.data_type == data_types::i8 &&
+ (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()); // only symmetric
+ if (i8_dt_case) {
+ auto ks_x = weights_layout.size.spatial[0];
+ auto ks_y = weights_layout.size.spatial[1];
+ if (input_layout.size.spatial[2] == 1 &&
+ input_layout.size.batch[0] < 16 &&
+ ((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
+ weights_layout.size.batch[0] >= 16 &&
+ ((conv->groups == 1 && conv->split() == 1) ||
+ conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]) ||
+ conv->split() == static_cast<int32_t>(input_layout.size.feature[0])) &&
+ conv->dilation == tensor{ 1 })
+ return true;
+ }
+ // A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
auto feature_block_size = 16;
auto correct_data_type = input_layout.data_type == data_types::f16 || input_layout.data_type == data_types::f32;
correct_data_type &= weights_layout.data_type == input_layout.data_type;
if (dims_count == 5 && is_grouped) {
return format::bfzyx;
} else if (dims_count == 4 && is_grouped && !is_dw) {
- return format::bfyx;
+ return format::b_fs_yx_fsv4;
}
bool asymmetric_quantization = node.activations_zero_points_term() || node.weights_zero_points_term();
const float cond_denom = _total_conv > 0 ? 1.0f / static_cast<float>(_total_conv) : 1.0f;
if ((input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8)) {
- expected_format = imad_case(node);
+ if ((_optimization_attributes.b_fs_yx_fsv16_network &&
+ convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+ expected_format = cldnn::format::b_fs_yx_fsv16;
+ } else {
+ expected_format = imad_case(node);
+ }
expected_tensor = current_layout.size;
} else if (_optimization_attributes.b_fs_zyx_fsv16_network &&
convolution_b_fs_zyx_fsv16_opt(input_layout,
#include "reshape_inst.h"
#include "activation_inst.h"
#include "scale_inst.h"
+#include "depth_to_space_inst.h"
#include "convolution_inst.h"
#include "concatenation_inst.h"
#include "crop_inst.h"
#include "proposal_inst.h"
#include "reorder_inst.h"
#include "split_inst.h"
+#include "mvn_inst.h"
#include "to_string_utils.h"
#include "gpu/memory_gpu.h"
// ToDo remove hidden dependencies from propagate_constants pass
apply_opt_pass<propagate_constants>();
}
+
+ if (options.get<build_option_type::optimize_data>()->enabled())
+ apply_opt_pass<remove_redundant_reorders>(lo, false, true, true); // pass to remove output reorders while all others graph optimizations were done
}
// mark if the node is constant assuming that all dependencies are marked properly
prim.type() != cldnn::prior_box::type_id() &&
prim.type() != cldnn::resample::type_id() &&
prim.type() != cldnn::crop::type_id() &&
- prim.type() != cldnn::scale::type_id())
+ prim.type() != cldnn::scale::type_id() &&
+ prim.type() != cldnn::depth_to_space::type_id() &&
+ (prim.type() != cldnn::mvn::type_id()
+ || (prim.as<mvn>().input().get_output_layout().data_type != data_types::u8 &&
+ prim.as<mvn>().input().get_output_layout().data_type != data_types::i8)
+ || prim.as<mvn>().get_primitive()->across_channels))
can_use_fsv16 = false;
// WA to keep bfyx_f16 layout disabled for some topologies where it leads to regressions.
#include "data_inst.h"
#include <algorithm>
#include <utility>
+#include <vector>
namespace cldnn {
// helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
data_node.attach_memory(*data_to_allocate, false);
}
+void program_helpers::reshape_deconvolution_weights(const std::vector<float> &deconv_weights,
+ const int channels,
+ const int kernel_width,
+ const int kernel_height,
+ const int scale_factor,
+ std::vector<std::vector<std::vector<float> > >& subpixel_weights) {
+
+ std::vector<std::vector<float> > weights(channels);
+
+ int pad_zero_x = kernel_width % 2 == 0 ? 0 : 1;
+ int pad_zero_y = kernel_height % 2 == 0 ? 0 : 1;
+
+    // zero-pad each deconv kernel up to an even size, e.g. 32 9x9 deconv kernels become 32 10x10 kernels
+ for (int f = 0; f < channels; ++f) {
+ for (int kernel_y = 0; kernel_y < kernel_height; ++kernel_y) {
+ for (int kernel_x = 0; kernel_x < kernel_width; ++kernel_x) {
+ int index = f * kernel_width * kernel_height + kernel_y * kernel_width + kernel_x;
+ weights[f].push_back(deconv_weights[index]);
+ }
+ if (pad_zero_x == 1) { // pad with zero on x axis
+ weights[f].push_back(0.f);
+ }
+ }
+ if (pad_zero_y == 1) { // pad a line on y axis with zero
+ for (int kernel_x = 0; kernel_x < kernel_width + pad_zero_x; ++kernel_x) {
+ weights[f].push_back(0.f);
+ }
+ }
+ }
+
+    // split the padded 10x10 kernels into scale_factor^2 (e.g. 4) sets of per-channel 5x5 subpixel kernels
+ for (int s = 0; s < scale_factor*scale_factor; ++s) {
+ subpixel_weights[s].resize(channels);
+ }
+
+ const int kernel_sz = kernel_width + pad_zero_x;
+
+ auto get_row_index = [](int index, const int kernel_sz)->int {
+ bool isRowEven = (index / (kernel_sz)) % 2 == 0 ? true : false;
+ bool isColEven = (index % 2) == 0 ? true : false;
+ int kernel_num = isRowEven ? (isColEven ? 0 : 1) : isColEven ? 2 : 3;
+ return kernel_num;
+ };
+
+ int feature_num = static_cast<int>(weights.size());
+ for (int f = 0; f < feature_num; ++f) {
+ for (int i = 0; i < static_cast<int>(weights[f].size()); ++i) {
+ int row = get_row_index(i, kernel_sz);
+ subpixel_weights[row][f].push_back(weights[f][i]);
+ }
+ }
+
+    // reverse each subpixel kernel's weights to match the access order of the shuffled convolution
+ int subpixel_conv_num = static_cast<int>(subpixel_weights.size());
+ for (int s = 0; s < subpixel_conv_num; ++s) {
+ for (int row = 0; row < static_cast<int>(subpixel_weights[s].size()); ++row) {
+ std::reverse(std::begin(subpixel_weights[s][row]), std::end(subpixel_weights[s][row]));
+ }
+ }
+}
+
// helper function for getting target layout used in depthwise sep optimization
layout program_helpers::get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split) {
auto mem_layout = data_node.get_output_layout();
if (ofmt != ifmt)
return layout(odt, ofmt, data_size, op);
- CLDNN_ERROR_MESSAGE(node.id(), "Reordering between winograd weights and data formats is unsupported");
+ CLDNN_ERROR_MESSAGE(node.id(), "No image_nv12 to image_nv12 reorder is supported");
} else if (ofmt.is_winograd() && ifmt.is_winograd()) {
if (ofmt == ifmt)
return layout(odt, ofmt, input_layout.size, op);
CLDNN_ERROR_MESSAGE(node.id(), "Reordering between winograd weights and data formats is unsupported");
+ } else if (ifmt == format::image_2d_rgba) {
+ return layout(data_types::f16, format::bfyx, input_layout.size, op);
}
// transformation of data from standard to winograd
std::vector<int32_t> output_shape;
if (std::find(desc->new_axis_mask.begin(), desc->new_axis_mask.end(), 1) == desc->new_axis_mask.end()) {
for (size_t i = 0; i < dims_num; ++i) {
- int32_t outputDimSize = (end[i] - begin[i]) / strides[i];
- if ((end[i] - begin[i]) % strides[i] != 0)
+ int32_t b = begin[i] < 0 ? input_layout.size.sizes(input_format)[i] - 1 : begin[i];
+ int32_t e = end[i] < 0 ? input_layout.size.sizes(input_format)[i] - 1 : end[i];
+ int32_t s = strides[i];
+ int32_t outputDimSize = std::abs((e - b) / s);
+ if ((e - b) % s != 0)
outputDimSize++;
output_shape.push_back(outputDimSize);
}
}
}
+TEST(concat_gpu, mixed_input_types_5d) {
+ const auto& engine = get_test_engine();
+
+ auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+ auto input1 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+ auto input2 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+ auto input3 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+
+ set_values(input0, { half_t(1.0f), half_t(2.0f), half_t(3.0f),
+ half_t(4.0f), half_t(2.0f), half_t(2.0f),
+ half_t(3.0f), half_t(4.0f), half_t(3.0f),
+ half_t(3.0f), half_t(3.0f), half_t(5.0f) });
+ set_values(input1, { half_t(11), half_t(12), half_t(13),
+ half_t(14), half_t(12), half_t(12),
+ half_t(13), half_t(14), half_t(13),
+ half_t(13), half_t(13), half_t(15) });
+ set_values(input2, { half_t(21), half_t(22), half_t(23),
+ half_t(24), half_t(22), half_t(22),
+ half_t(23), half_t(24), half_t(23),
+ half_t(23), half_t(23), half_t(25) });
+ set_values(input3, { half_t(31.f), half_t(32.f), half_t(33.f),
+ half_t(34.f), half_t(32.f), half_t(32.f),
+ half_t(33.f), half_t(34.f), half_t(33.f),
+ half_t(33.f), half_t(33.f), half_t(35.f) });
+
+ VF<float> output_vec = {
+ 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f,
+ 11.0f, 12.0f, 13.0f, 14.0f, 12.0f, 12.0f, 13.0f, 14.0f, 13.0f, 13.0f, 13.0f, 15.0f,
+ 21.0f, 22.0f, 23.0f, 24.0f, 22.0f, 22.0f, 23.0f, 24.0f, 23.0f, 23.0f, 23.0f, 25.0f,
+ 31.0f, 32.0f, 33.0f, 34.0f, 32.0f, 32.0f, 33.0f, 34.0f, 33.0f, 33.0f, 33.0f, 35.0f };
+
+ topology topology(
+ input_layout("input0", input0.get_layout()),
+ input_layout("input1", input1.get_layout()),
+ input_layout("input2", input2.get_layout()),
+ input_layout("input3", input3.get_layout()),
+ concatenation("concat",
+ { "input0", "input1", "input2", "input3" },
+ concatenation::concatenation_axis::along_f,
+ data_types::f32,
+ padding{ { 0,0,0,0 }, 0 })
+ );
+
+ network network(engine, topology);
+ network.set_input_data("input0", input0);
+ network.set_input_data("input1", input1);
+ network.set_input_data("input2", input2);
+ network.set_input_data("input3", input3);
+
+ auto outputs = network.execute();
+ EXPECT_EQ(outputs.size(), size_t(1));
+ EXPECT_EQ(outputs.begin()->first, "concat");
+
+ auto output_memory = outputs.at("concat").get_memory();
+ auto output_layout = output_memory.get_layout();
+ auto output_ptr = output_memory.pointer<float>();
+
+ int z_size = output_layout.size.spatial[2];
+ int y_size = output_layout.size.spatial[1];
+ int x_size = output_layout.size.spatial[0];
+ int f_size = output_layout.size.feature[0];
+ int b_size = output_layout.size.batch[0];
+ EXPECT_EQ(output_layout.format, format::bfzyx);
+ EXPECT_EQ(z_size, 3);
+ EXPECT_EQ(y_size, 4);
+ EXPECT_EQ(x_size, 1);
+ EXPECT_EQ(f_size, 4);
+ EXPECT_EQ(b_size, 1);
+
+ for (size_t x = 0; x < output_layout.count(); ++x) {
+ EXPECT_EQ(output_vec[x], output_ptr[x]);
+ }
+}
+
using TestParamType_concat = ::testing::tuple<size_t, // 0 - Input Batch size
std::vector<size_t>, // 1 - Inputs Features Sizes
size_t, // 2 - Input Y Size
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
- int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false,
+ int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
{
size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
for (size_t yf = 0; yf < filter[0].size(); ++yf) {
int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
- if (!yi_inside && !asymm_data) continue;
+ if (!yi_inside) continue;
for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
- if (!xi_inside && !asymm_data) continue;
+ if (!xi_inside) continue;
- AccT input_val;
- if (xi_inside && yi_inside) {
- input_val = static_cast<AccT>(input[f][yi][xi]);
- } else {
- input_val = static_cast<AccT>(0);
- }
+ auto input_val = static_cast<AccT>(input[f][yi][xi]);
if (asymm_data) {
input_val = input_val - static_cast<AccT>(data_zp[f]);
}
AccT weights_val;
- if (!depthwise) {
+ if (!depthwise && !grouped) {
weights_val = static_cast<AccT>(filter[f][yf][xf]);
- } else {
+ } else if (grouped) {
+ weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
+ }
+ else {
weights_val = static_cast<AccT>(filter[0][yf][xf]);
}
int, // 5 - Output padding
bool>; // 6 - With bias
+using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
+ int, // 1 - Input Y size
+ int, // 2 - Input features
+ int, // 3 - Output features
+ int, // 4 - Kernel sizeX
+ int, // 5 - Kernel sizeY
+ int, // 6 - Groups number
+ int, // 7 - Stride
+ int>; // 8 - Batch
+
struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
{
static std::string
}
};
+struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_grouped_convolution_gpu> {
+ static std::string PrintToStringParamName(
+ testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
+ // construct a readable name
+ return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
+ std::to_string(testing::get<1>(param_info.param)) + "y" +
+ std::to_string(testing::get<2>(param_info.param)) + "f" +
+ "_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
+ "_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
+ std::to_string(testing::get<5>(param_info.param)) + "y" +
+ "_groups" + std::to_string(testing::get<6>(param_info.param)) +
+ "_stride" + std::to_string(testing::get<7>(param_info.param)) +
+ "_batch" + std::to_string(testing::get<8>(param_info.param));
+ }
+};
+
TEST_P(convolution_gpu, b_fs_yx_fsv4)
{
const int in_B = 2;
}
}
+TEST(convolution_f16_fsv_gpu, convolution_f16_fsv_gpu_padding) {
+ const auto& engine = get_test_engine();
+
+ if (!engine.get_info().supports_fp16)
+ {
+ std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
+ EXPECT_EQ(1, 1);
+ return;
+ }
+
+ const int batch_num = 2;
+ const int input_xy = 32;
+ const int input_f = 96;
+ const int output_f = 192;
+ const int filter_xy = 1;
+ const int stride = 1;
+ const int output_xy = 1 + (input_xy - filter_xy) / stride;
+
+ auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
+ auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
+ auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+ auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
+ set_values(input_mem, input_data_bfyx);
+
+ auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
+ auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
+ auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+ auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
+ set_values(weights_mem, weights_data_bfyx);
+
+    // Holds the reference convolution results computed below (bias is always applied in this test)
+ auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
+
+ topology topology(
+ input_layout("input", input_mem.get_layout()),
+ data("weights_fsv", weights_mem));
+
+ // add input padding by X and Y
+ layout w_pad(data_types::f16, format::bfyx, input_size, padding({ 0,0,1,1 }, { 0, 0, 0, 0 }));
+ topology.add(reorder("input_fsv", "input", w_pad));
+
+ // Generate bias data
+ auto biases_size = tensor(1, output_f, 1, 1);
+ auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
+ auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
+ set_values(biases_mem, biases_data);
+
+ // Calculate reference values
+ for (auto bi = 0; bi < batch_num; ++bi)
+ {
+ for (auto ofi = 0; ofi < output_f; ++ofi)
+ {
+ reference_result[bi][ofi] = reference_convolve(
+ input_data[bi], weights_data[ofi],
+ stride, stride,
+ biases_data[ofi],
+ 1, 1);
+ }
+ }
+
+ topology.add(data("biases_fsv", biases_mem));
+
+ auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" }, { "biases_fsv" },
+ { 1, 1, stride, stride }, { 0, 0, 0, 0 });
+
+ topology.add(conv_fsv);
+
+ build_options options;
+ implementation_desc conv_impl = { format::fs_b_yx_fsv32, "convolution_gpu_bfyx_to_fs_byx_fsv32" };
+ options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
+ options.set_option(build_option::optimize_data(true));
+ network network(engine, topology, options);
+
+ network.set_input_data("input", input_mem);
+
+ network.execute();
+
+ auto out_mem = network.get_output("conv_fsv").get_memory();
+ auto out_ptr = out_mem.pointer<FLOAT16>();
+
+ ASSERT_EQ(out_mem.get_layout().format, format::fs_b_yx_fsv32);
+
+ for (int bi = 0; bi < batch_num; ++bi)
+ for (int fi = 0; fi < output_f; ++fi)
+ for (int yi = 0; yi < output_xy; ++yi)
+ for (int xi = 0; xi < output_xy; ++xi)
+ {
+ auto val_ref = reference_result[bi][fi][yi][xi];
+ auto val = out_ptr[(fi / 32) * batch_num * output_xy * output_xy * 32 +
+ bi * output_xy * output_xy * 32 +
+ yi * output_xy * 32 +
+ xi * 32 +
+ fi % 32];
+ auto equal = are_equal(val_ref, val, 1e-2f);
+ EXPECT_TRUE(equal);
+ if (!equal)
+ {
+ std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
+ }
+ }
+}
+
using TestParamType_convolution_gpu_with_crop = ::testing::tuple<int, // 0 - Filter size
int, // 1 - Input size
int, // 2 - Input/output features
}
+
+TEST(convolution_gpu, bfyx_iyxo_5x5_fp16)
+{
+
+ const auto& engine = get_test_engine();
+
+ if (!engine.get_info().supports_fp16)
+ {
+ std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
+ EXPECT_EQ(1, 1);
+ return;
+ }
+
+ const int batch_num = 1;
+ const int output_f = 4;
+
+ const int input_f = 32;
+ const int filter_xy = 5;
+ const int stride = 1;
+ const int output_padding = 0;
+ const bool with_bias = false;
+ const int input_size_x = 64;
+ const int input_size_y = 20;
+
+
+ const int input_offset = -(filter_xy / 2);
+
+ const int output_x = 1 + (input_size_x + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
+
+ const int output_y = 1 + (input_size_y + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
+
+ auto input_size = tensor(batch_num, input_f, input_size_x, input_size_y);
+ auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_size_y, input_size_x, -1, 1);
+
+ auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+ auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
+ set_values(input_mem, input_data_bfyx);
+
+ auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
+ auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
+ auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+ auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
+
+ set_values(weights_mem, weights_data_bfyx);
+
+ // Will be used to store reference values calculated in branches depending on bias
+ auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
+
+ topology topology(
+ input_layout("input", input_mem.get_layout()),
+ data("weights_fsv", weights_mem)
+ );
+
+ if (with_bias)
+ {
+ // Generate bias data
+ auto biases_size = tensor(1, output_f, 1, 1);
+ auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
+ auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
+ set_values(biases_mem, biases_data);
+
+ // Calculate reference values with bias
+ for (auto bi = 0; bi < batch_num; ++bi)
+ {
+ for (auto ofi = 0; ofi < output_f; ++ofi)
+ {
+ reference_result[bi][ofi] = reference_convolve(
+ input_data[bi], weights_data[ofi],
+ stride, stride, biases_data[ofi],
+ 1, 1, // dilation
+ -input_offset, -input_offset, // input padding
+ output_padding, output_padding);
+ }
+ }
+
+ topology.add(data("biases_fsv", biases_mem));
+
+ auto conv_fsv = convolution("conv_fsv", "input", { "weights_fsv" }, { "biases_fsv" },
+ { 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
+ conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
+
+ topology.add(conv_fsv);
+ }
+ else
+ {
+
+ // Calculate reference values without bias
+ for (auto bi = 0; bi < batch_num; ++bi)
+ {
+ for (auto ofi = 0; ofi < output_f; ++ofi)
+ {
+ reference_result[bi][ofi] = reference_convolve(
+ input_data[bi], weights_data[ofi],
+ stride, stride,
+ 0, // bias
+ 1, 1, // dilation
+ -input_offset, -input_offset, // input padding
+ output_padding, output_padding);
+ }
+ }
+
+
+ auto conv_fsv = convolution("conv_fsv", "input", { "weights_fsv" },
+ { 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
+ conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
+
+ topology.add(conv_fsv);
+ }
+
+
+ build_options options;
+ implementation_desc conv_impl = { format::bfyx, "" };
+ options.set_option(build_option::optimize_data(true));
+ network network(engine, topology, options);
+
+ network.set_input_data("input", input_mem);
+
+ network.execute();
+
+ auto out_mem = network.get_output("conv_fsv").get_memory();
+ auto out_ptr = out_mem.pointer<FLOAT16>();
+
+ for (int bi = 0; bi < batch_num; ++bi)
+ for (int fi = 0; fi < output_f; ++fi)
+ for (int yi = 0; yi < output_y; ++yi)
+ for (int xi = 0; xi < output_x; ++xi)
+ {
+ auto val_ref = reference_result[bi][fi][yi][xi];
+ auto val = out_ptr[bi * output_f * output_x * output_y +
+ fi * output_y * output_x +
+ yi * output_x +
+ xi];
+ auto equal = are_equal(val_ref, val, 1e-2f);
+ EXPECT_TRUE(equal);
+ if (!equal)
+ {
+ std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
+ }
+ }
+
+}
+
INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
convolution_gpu_block_layout,
::testing::Values(
),
convolution_depthwise_gpu::PrintToStringParamName);
+INSTANTIATE_TEST_CASE_P(convolution_grouped_b_fs_yx_fsv4,
+ convolution_grouped_gpu,
+ ::testing::Values(
+                            // Input X size, Input Y size, Input features, Output features, Kernel size X, Kernel size
+                            // Y, Groups number, Stride, Batch
+ TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1),
+ TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4),
+ TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1),
+ TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2),
+ TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1),
+ TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1),
+ TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1)),
+ convolution_grouped_gpu::PrintToStringParamName);
+
+TEST_P(convolution_grouped_gpu, grouped_b_fs_yx_fsv4) {
+ const auto& engine = get_test_engine();
+
+ const int input_x = testing::get<0>(GetParam()),
+ input_y = testing::get<1>(GetParam()),
+ input_f = testing::get<2>(GetParam()),
+ output_f = testing::get<3>(GetParam()),
+ filter_x = testing::get<4>(GetParam()),
+ filter_y = testing::get<5>(GetParam()),
+ groups = testing::get<6>(GetParam()),
+ stride = testing::get<7>(GetParam()),
+ batch_num = testing::get<8>(GetParam()),
+ output_padding = 0,
+ input_offset_y = (filter_x - 1) / 2,
+ input_offset_x = (filter_y - 1) / 2;
+
+ auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
+ auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
+ auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
+ auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
+ set_values(input, input_rnd_vec);
+
+ auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
+ VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
+ auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
+
+ std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
+ for (int gi = 0; gi < groups; ++gi)
+ for (int ofi = 0; ofi < output_f / groups; ++ofi)
+ for (int ifi = 0; ifi < input_f / groups; ++ifi)
+ for (int kyi = 0; kyi < filter_y; ++kyi)
+ for (int kxi = 0; kxi < filter_x; ++kxi) {
+ tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
+ size_t offset = weights_lay.get_linear_offset(coords);
+ weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
+ }
+ auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
+ set_values(weights, weights_flat);
+
+ VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
+
+ // Calculate reference values without bias
+ for (int bi = 0; bi < batch_num; ++bi)
+ for (int gi = 0; gi < groups; ++gi)
+ for (int ofi = 0; ofi < (int)weights_rnd[0].size(); ++ofi) {
+ bool grouped = groups > 1;
+ int f_begin = gi * input_f / groups;
+ int f_end = gi * input_f / groups + input_f / groups;
+
+ expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
+ input_rnd[bi], weights_rnd[gi][ofi], // input, weights
+ stride, stride, // strides
+ 0, // bias
+ 1, 1, // dilation
+ input_offset_y, input_offset_x, // input padding
+ 0, 0, // output_padding
+ f_begin, f_end, // f_begin, f_end
+ false, // depthwise
+ grouped); // grouped
+ }
+
+ topology topology(input_layout("input", input.get_layout()),
+ data("weights", weights),
+ reorder("input_fsv", "input", {data_types::u8, format::b_fs_yx_fsv4, input_size}),
+ convolution("conv",
+ "input_fsv",
+ {"weights"},
+ groups,
+ {1, 1, stride, stride},
+ {0, 0, -input_offset_x, -input_offset_y},
+ {1, 1, 1, 1},
+ padding({0, 0, output_padding, output_padding}, 0.f)));
+
+ build_options options;
+ options.set_option(build_option::optimize_data(true));
+ implementation_desc conv_impl = {format::b_fs_yx_fsv4, "fused_conv_eltwise_gpu_imad"};
+ options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
+
+ network network(engine, topology, options);
+ network.set_input_data("input", input);
+ network.execute();
+
+ auto out_mem = network.get_output("conv").get_memory();
+ auto out_ptr = out_mem.pointer<float>();
+ auto out_lay = out_mem.get_layout();
+
+ ASSERT_EQ(out_mem.get_layout().format, format::b_fs_yx_fsv4);
+ ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
+ ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
+ ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
+
+ for (int bi = 0; bi < batch_num; ++bi)
+ for (int ofi = 0; ofi < output_f; ++ofi)
+ for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
+ for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
+ tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
+ auto offset = out_lay.get_linear_offset(coords);
+ auto val = out_ptr[offset];
+ auto val_ref = expected_result[bi][ofi][yi][xi];
+ auto equal = are_equal(val_ref, val, 1e-2f);
+ if (!equal) {
+ std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
+ std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
+ }
+ EXPECT_TRUE(equal);
+ }
+}
+
template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_test_base : public testing::Test {
+// Base harness for convolution tests. No longer derives from testing::Test so a
+// concrete harness can be constructed directly inside a shared parameterized
+// fixture's TEST_P body (see convolution_random_smoke_test below).
+class convolution_test_base {
public:
virtual topology build_topology(const cldnn::engine& engine) {
auto input_lay = layout(input_type(), input_format(), input_size());
auto topo = topology();
topo.add(input_layout("input", input_lay));
+ std::string input_id = "input";
+ // Asymmetric (zero-point) input: subtract a per-feature input zero-point via
+ // an eltwise sub node, and feed the convolution from that node instead of "input".
+ if (has_input_zp()) {
+ auto input_zp_lay = layout(input_type(), format::bfyx, tensor(feature(input_features())));
+ auto input_zp_mem = memory::allocate(engine, input_zp_lay);
+ set_values(input_zp_mem, _input_zp);
+ topo.add(data("input_zp", input_zp_mem));
+ topo.add(eltwise("input_asymm", { "input", "input_zp" }, eltwise_mode::sub));
+ input_id = "input_asymm";
+ }
topo.add(data("weights", wei_mem));
+ std::string weights_id = "weights";
+ // Asymmetric weights: same pattern, with the zero-point laid out along the
+ // batch dimension (one value per output channel).
+ if (has_weights_zp()) {
+ auto weights_zp_lay = layout(weights_type(), format::bfyx, tensor(batch(output_features())));
+ auto weights_zp_mem = memory::allocate(engine, weights_zp_lay);
+ set_values(weights_zp_mem, _weights_zp);
+ topo.add(data("weights_zp", weights_zp_mem));
+ topo.add(eltwise("weights_asymm", { "weights", "weights_zp" }, eltwise_mode::sub));
+ weights_id = "weights_asymm";
+ }
if (!has_bias()) {
auto conv_prim = convolution(
"conv",
- "input",
- { "weights" },
+ // Consume the (possibly zero-point-adjusted) input/weights ids chosen above.
+ input_id,
+ { weights_id },
static_cast<uint32_t>(groups()),
tensor(batch(0), feature(0), spatial(_stride_x, _stride_y)),
tensor(batch(0), feature(0), spatial(_offset_x, _offset_y)),
topo.add(data("bias", bias_mem));
auto conv_prim = convolution(
"conv",
- "input",
- { "weights" },
+ input_id,
+ { weights_id },
{ "bias" },
static_cast<uint32_t>(groups()),
tensor(batch(0), feature(0), spatial(_stride_x, _stride_y)),
auto topo = build_topology(engine);
auto build_opts = build_options(
- build_option::optimize_data(true)
+ build_option::optimize_data(true),
+ // Pin "conv" to the requested input format so layout optimization cannot
+ // silently switch the format under test (empty string: presumably means
+ // "any kernel name" — confirm against implementation_desc).
+ build_option::force_implementations({ {"conv", {input_format(), ""}} })
);
auto prog = program(engine, topo, build_opts);
auto out_lay = out_mem.get_layout();
auto out_ptr = out_mem.cldnn::memory::template pointer<OutputT>();
+ // Collect diagnostics (kernel chosen for "conv" + full executed primitive list)
+ // so they can be appended to assertion-failure messages below.
+ std::stringstream description;
+ for (auto i : net.get_primitives_info()) {
+ if (i.original_id == "conv") {
+ description << " kernel: " << i.kernel_id << std::endl;
+ }
+ }
+ description << " executed: ";
+ for (auto e : net.get_executed_primitive_ids()) {
+ description << e << ", ";
+ }
+
ASSERT_EQ(out_lay.data_type, output_type());
ASSERT_EQ(out_lay.size.batch[0], expected.size());
ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = out_lay.get_linear_offset(coords);
- EXPECT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
- << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+ // ASSERT (not EXPECT): stop at the first mismatch instead of flooding the log,
+ // and attach the kernel/executed-primitives description for triage.
+ ASSERT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
+ << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi << std::endl
+ << description.str();
}
}
_dilation_y = dilation_y;
}
+ // Install per-feature input zero-points; an empty vector disables asymmetric input.
+ void set_input_zp(VF<InputT> input_zp) {
+ _input_zp = std::move(input_zp);
+ }
+
+ // Install per-output-channel weights zero-points; an empty vector disables them.
+ void set_weights_zp(VF<WeightsT> weights_zp) {
+ _weights_zp = std::move(weights_zp);
+ }
+
protected:
VVVVF<InputT> _input;
VVVVF<WeightsT> _weights;
VF<OutputT> _bias;
+ VF<InputT> _input_zp;
+ VF<WeightsT> _weights_zp;
format::type _input_fmt;
int _stride_x, _stride_y;
int _offset_x, _offset_y;
size_t groups() const { return input_features() / weights_input_features(); }
bool has_bias() { return _bias.size() > 0; }
+ // Zero-point presence is encoded by non-emptiness of the zp vectors.
+ bool has_input_zp() { return _input_zp.size() > 0; }
+ bool has_weights_zp() { return _weights_zp.size() > 0; }
data_types input_type() const { return type_to_data_type<InputT>::value; }
format input_format() const { return _input_fmt; }
bool with_bias;
size_t groups;
format::type input_format;
+ bool asymmetric_weights;
+ bool asymmetric_data;
};
-using convolution_random_test_params = std::tuple<
- size_t, // batch
- size_t, // input features
- size_t, // output features
- std::tuple<size_t, size_t>, // input x, y
- std::tuple<size_t, size_t>, // filter x, y
- std::tuple<int, int>, // stride x, y
- std::tuple<int, int>, // offset x, y
- std::tuple<int, int>, // dilation x, y
- bool, // with bias
- format::type // input format
->;
-
-static convolution_random_test_all_params convert_random_test_params(const convolution_random_test_params& params) {
- convolution_random_test_all_params all_params;
- std::forward_as_tuple(
- all_params.batch,
- all_params.input_features,
- all_params.output_features,
- std::forward_as_tuple(all_params.input_xy[0], all_params.input_xy[1]),
- std::forward_as_tuple(all_params.filter_xy[0], all_params.filter_xy[1]),
- std::forward_as_tuple(all_params.stride_xy[0], all_params.stride_xy[1]),
- std::forward_as_tuple(all_params.offset_xy[0], all_params.offset_xy[1]),
- std::forward_as_tuple(all_params.dilation_xy[0], all_params.dilation_xy[1]),
- all_params.with_bias,
- all_params.input_format) = params;
- all_params.groups = 1;
- return all_params;
-}
-
-using convolution_random_test_depthwise_params = std::tuple<
- size_t, // batch
- size_t, // input/output features
- std::tuple<size_t, size_t>, // input x, y
- std::tuple<size_t, size_t>, // filter x, y
- std::tuple<int, int>, // stride x, y
- std::tuple<int, int>, // offset x, y
- std::tuple<int, int>, // dilation x, y
- bool, // with bias
- format::type // input format
->;
-
-static convolution_random_test_all_params convert_random_test_params(const convolution_random_test_depthwise_params& params) {
- convolution_random_test_all_params all_params;
- std::forward_as_tuple(
- all_params.batch,
- all_params.input_features,
- std::forward_as_tuple(all_params.input_xy[0], all_params.input_xy[1]),
- std::forward_as_tuple(all_params.filter_xy[0], all_params.filter_xy[1]),
- std::forward_as_tuple(all_params.stride_xy[0], all_params.stride_xy[1]),
- std::forward_as_tuple(all_params.offset_xy[0], all_params.offset_xy[1]),
- std::forward_as_tuple(all_params.dilation_xy[0], all_params.dilation_xy[1]),
- all_params.with_bias,
- all_params.input_format) = params;
- all_params.groups = all_params.input_features;
- all_params.output_features = all_params.input_features;
- return all_params;
-}
-
template <typename InputT, typename WeightsT, typename OutputT>
class convolution_random_test_base : public convolution_test_base<InputT, WeightsT, OutputT> {
public:
virtual VVVVF<OutputT> calculate_reference() {
VVVVF<OutputT> expected = VVVVF<OutputT>(this->batch_num(), VVVF<OutputT>(this->output_features()));
bool depthwise = this->groups() == this->input_features();
+ // grouped = grouped-but-not-depthwise convolution; forwarded to the reference impl.
+ bool grouped = (this->groups() > 1 && !depthwise) ? true : false;
for (size_t bi = 0; bi < this->batch_num(); ++bi)
for (size_t fi = 0; fi < this->output_features(); ++fi) {
size_t f_begin = depthwise ? fi : 0;
size_t f_end = (depthwise ? fi : 0) + this->weights_input_features();
auto bias = this->has_bias() ? this->_bias[fi] : static_cast<OutputT>(0);
+ // Weights zero-point is per output channel; 0 when asymmetric weights are off.
+ auto weights_zp = this->has_weights_zp() ? this->_weights_zp[fi] : static_cast<WeightsT>(0);
expected[bi][fi] = reference_convolve<InputT, OutputT, WeightsT>(
this->_input[bi],
this->_weights[fi],
0,
f_begin,
f_end,
- depthwise);
+ // New reference args: grouped flag plus input/weights zero-points.
+ depthwise,
+ grouped,
+ this->_input_zp,
+ weights_zp);
}
return expected;
}
auto weights_data = generate_random_4d<WeightsT>(
params.output_features, wei_in_f, params.filter_xy[1], params.filter_xy[0], -256, 256);
auto bias_data = params.with_bias ? generate_random_1d<OutputT>(params.output_features, -256, 256) : VF<OutputT>();
+ // Generate zero-points only when the params ask for asymmetric quantization;
+ // an empty vector keeps the symmetric path (see has_input_zp/has_weights_zp).
+ auto weights_zp_data = params.asymmetric_weights ? generate_random_1d<WeightsT>(params.output_features, -256, 256) : VF<WeightsT>();
+ auto input_zp_data = params.asymmetric_data ? generate_random_1d<InputT>(params.input_features, -256, 256) : VF<InputT>();
this->set_input(params.input_format, std::move(input_data));
this->set_weights(std::move(weights_data));
this->set_strides(params.stride_xy[0], params.stride_xy[1]);
this->set_offsets(params.offset_xy[0], params.offset_xy[1]);
this->set_dilation(params.dilation_xy[0], params.dilation_xy[1]);
+ this->set_weights_zp(std::move(weights_zp_data));
+ this->set_input_zp(std::move(input_zp_data));
}
void run_random(const convolution_random_test_all_params& params) {
// construct a readable name in format as follows:
// <out format>_i<input>_w<weights>_s<stride>_ofs<offset>_d<dilation>_g<groups>_<bias>
-static std::string to_string_convolution_all_params(const convolution_random_test_all_params& params) {
+// Signature now matches gtest's param-name generator contract: it takes
+// TestParamInfo directly, so it can be passed as the name-generator argument
+// of INSTANTIATE_TEST_CASE_P without a wrapper.
+static std::string to_string_convolution_all_params(const testing::TestParamInfo<convolution_random_test_all_params>& param_info) {
+ auto& params = param_info.param;
int Batch = (int)params.batch;
int iF = (int)params.input_features;
int oF = (int)params.output_features;
auto groups = params.groups;
bool Bias = params.with_bias;
format::type iType = params.input_format; // input format
+ bool asymm_weights = params.asymmetric_weights;
+ bool asymm_input = params.asymmetric_data;
// Wrapper for negative values, as e.g. "-1" would generate an invalid gtest param string
auto to_string_neg = [](int val) {
if (val >= 0)
"_ofs" + to_string_neg(Offset[0]) + 'x' + to_string_neg(Offset[1]) +
"_d" + std::to_string(Dilation[0]) + 'x' + std::to_string(Dilation[1]) +
"_g" + std::to_string(groups) +
- (Bias ? "_bias" : "");
-}
-
-template <typename T>
-std::string to_string_convolution_random_params(testing::TestParamInfo<T> param_info) {
- return to_string_convolution_all_params(convert_random_test_params(param_info.param));
+ // "_wzp"/"_izp" suffixes mark asymmetric-weights / asymmetric-input cases.
+ (Bias ? "_bias" : "") + (asymm_weights ? "_wzp" : "") + (asymm_input ? "_izp" : "");
}
template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_random_test : public convolution_random_test_base<InputT, WeightsT, OutputT>
- , public testing::WithParamInterface<convolution_random_test_params> {};
-
-
-using convolution_random_test_s8s8f32 = convolution_random_test<int8_t, int8_t, float>;
-using convolution_random_test_u8s8f32 = convolution_random_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_random_test_s8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-TEST_P(convolution_random_test_u8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4,
- convolution_random_test_s8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input features
- testing::Values(16, 32), // output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_params>);
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4,
- convolution_random_test_u8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input features
- testing::Values(16, 32), // output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_params>);
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4_1x1_lwg_opt,
- convolution_random_test_s8s8f32,
- testing::Combine(
- testing::Values(1), // batch
- testing::Values(128, 256, 512), // input features
- testing::Values(64), // output features
- testing::Values(std::pair<size_t, size_t>(3, 3)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_params>);
-
-template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_random_dw_test : public convolution_random_test_base<InputT, WeightsT, OutputT>
- , public testing::WithParamInterface<convolution_random_test_depthwise_params> {};
-
-using convolution_random_test_dw_s8s8f32 = convolution_random_dw_test<int8_t, int8_t, float>;
-using convolution_random_test_dw_u8s8f32 = convolution_random_dw_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_random_test_dw_s8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-TEST_P(convolution_random_test_dw_u8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4,
- convolution_random_test_dw_s8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input/output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0), std::pair<int, int>(-1, -1)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_depthwise_params>);
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4,
- convolution_random_test_dw_u8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input/output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0), std::pair<int, int>(-1, -1)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_depthwise_params>);
-
-INSTANTIATE_TEST_CASE_P(
- special_cases,
- convolution_random_test_dw_u8s8f32,
- testing::Values(
- convolution_random_test_depthwise_params(
- 1, 32, std::pair<size_t, size_t>(28, 28), std::pair<size_t, size_t>(3, 3),
- std::pair<int, int>(1, 1), std::pair<int, int>(-1, -1), std::pair<int, int>(1, 1), true, format::b_fs_yx_fsv4)
- ),
- to_string_convolution_random_params<convolution_random_test_depthwise_params>);
-
-template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_scale_random_test : public convolution_random_test<InputT, WeightsT, OutputT> {
+class convolution_scale_random_test : public convolution_random_test_base<InputT, WeightsT, OutputT> {
public:
- using parent = convolution_random_test<InputT, WeightsT, OutputT>;
+ using parent = convolution_random_test_base<InputT, WeightsT, OutputT>;
virtual primitive_id output_primitive_id() const {
return "scale_wa_reorder";
auto expected = parent::calculate_reference();
for (size_t bi = 0; bi < this->batch_num(); ++bi)
- for (size_t fi = 0; fi < this->output_features(); ++fi) {
- expected[bi][fi] = reference_scale_post_op<OutputT>(expected[bi][fi], _scale[fi], _shift[fi]);
- }
+ for (size_t fi = 0; fi < this->output_features(); ++fi) {
+ expected[bi][fi] = reference_scale_post_op<OutputT>(expected[bi][fi], _scale[fi], _shift[fi]);
+ }
return expected;
}
VF<OutputT> _shift;
};
-using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test<int8_t, int8_t, float>;
-using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_scale_random_test_s8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-TEST_P(convolution_scale_random_test_u8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4,
- convolution_scale_random_test_s8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input features
- testing::Values(16, 32), // output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_params>);
-
-INSTANTIATE_TEST_CASE_P(
- b_fs_yx_fsv4,
- convolution_scale_random_test_u8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input features
- testing::Values(16, 32), // output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::b_fs_yx_fsv4) // input format
- ),
- to_string_convolution_random_params<convolution_random_test_params>);
-
-template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_asymm_weights_data_random_test : public convolution_random_test<InputT, WeightsT, OutputT> {
- using parent = convolution_random_test<InputT, WeightsT, OutputT>;
-
- virtual primitive_id output_primitive_id() const {
- return "conv_wa_reorder";
- }
+// Parameterized smoke fixture: each TEST_P below constructs a concrete
+// convolution_*_random_test harness and runs it on the current param set.
+class convolution_random_smoke_test : public testing::TestWithParam<convolution_random_test_all_params> {};
- topology build_topology(const cldnn::engine& engine) override {
- auto input_lay = layout(this->input_type(), this->input_format(), this->input_size());
- auto wei_lay = layout(this->weights_type(), format::bfyx, this->weights_size());
- auto data_zp_lay = layout(this->input_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1)));
- auto wei_zp_lay = layout(this->weights_type(), format::bfyx, tensor(batch(this->output_features()), feature(1), spatial(1, 1)));
+using convolution_random_test_s8s8f32 = convolution_random_test_base<int8_t, int8_t, float>;
+using convolution_random_test_u8s8f32 = convolution_random_test_base<uint8_t, int8_t, float>;
- auto wei_mem = memory::allocate(engine, wei_lay);
- auto data_zp_mem = memory::allocate(engine, data_zp_lay);
- auto wei_zp_mem = memory::allocate(engine, wei_zp_lay);
- auto weights_flat = flatten_4d(format::bfyx, this->_weights);
- set_values(wei_mem, weights_flat);
- set_values(data_zp_mem, _data_zp);
- set_values(wei_zp_mem, _weights_zp);
+using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test<int8_t, int8_t, float>;
+using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test<uint8_t, int8_t, float>;
- auto topo = topology();
- topo.add(input_layout("input", input_lay));
- topo.add(data("weights", wei_mem));
- topo.add(data("data_zp", data_zp_mem));
- topo.add(data("weights_zp", wei_zp_mem));
- auto input_asymm_prim = eltwise("input_asymm", "input", "data_zp", eltwise_mode::sub);
- auto weights_asymm_prim = eltwise("weights_asymm", "weights", "weights_zp", eltwise_mode::sub);
- input_asymm_prim.output_data_type = data_types::f32;
- weights_asymm_prim.output_data_type = data_types::f32;
- topo.add(input_asymm_prim);
- topo.add(weights_asymm_prim);
- if (!this->has_bias()) {
- auto conv_prim = convolution(
- "conv",
- "input_asymm",
- { "weights_asymm" },
- tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
- tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
- tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
- conv_prim.output_data_type = this->output_type();
- topo.add(conv_prim);
- } else {
- auto bias_lay = layout(this->output_type(), format::bfyx, tensor(feature(this->output_features())));
- auto bias_mem = memory::allocate(engine, bias_lay);
- set_values(bias_mem, this->_bias);
- topo.add(data("bias", bias_mem));
- auto conv_prim = convolution(
- "conv",
- "input_asymm",
- { "weights_asymm" },
- { "bias" },
- tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
- tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
- tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
- conv_prim.output_data_type = this->output_type();
- topo.add(conv_prim);
+// Builder for lists of convolution test params. Each *_params method appends a
+// batch of cases for the given input format and asymmetric (zero-point)
+// weights/data flags, returning *this so calls can be chained inside
+// INSTANTIATE_TEST_CASE_P.
+struct params_generator : std::vector<convolution_random_test_all_params> {
+ // Fast subset: 7x7/3x3/1x1/5x5 filters, depthwise and dilation cases, batches 1 and 2.
+ params_generator& smoke_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false) {
+ std::vector<size_t> batches = { 1, 2 };
+ for (auto b : batches) {
+ // 7x7
+ push_back(convolution_random_test_all_params{
+ b, 3, 32, { 28, 28 }, { 7, 7 }, { 2, 2 }, { -3, -3 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ // 3x3
+ push_back(convolution_random_test_all_params{
+ b, 32, 48, { 14, 14 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 32, 48, { 14, 14 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ // 1x1
+ push_back(convolution_random_test_all_params{
+ b, 32, 48, { 28, 28 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 32, 48, { 28, 28 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ // 5x5
+ push_back(convolution_random_test_all_params{
+ b, 32, 48, { 28, 28 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 32, 48, { 28, 28 }, { 5, 5 }, { 2, 2 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ // depthwise
+ push_back(convolution_random_test_all_params{
+ b, 64, 64, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 64, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 64, 64, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 1, 1 }, true, 64, input_format, asymm_weights, asymm_data });
+ // dilation
+ push_back(convolution_random_test_all_params{
+ b, 32, 24, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 2, 2 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 32, 24, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 2, 2 }, true, 1, input_format, asymm_weights, asymm_data });
}
- topo.add(reorder("conv_wa_reorder", "conv", format::bfyx, this->output_type()));
-
- return topo;
+ return *this;
+ }
+
+ // Extra coverage with channel counts (23, 41, 28) that are not multiples of 16,
+ // including a non-square 19x17 input.
+ params_generator& extra_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false) {
+ std::vector<size_t> batches = { 1, 2 };
+ for (auto b : batches) {
+ // 1x1
+ push_back(convolution_random_test_all_params{
+ b, 23, 41, { 19, 19 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 23, 41, { 19, 19 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ // 3x3
+ push_back(convolution_random_test_all_params{
+ b, 16, 28, { 14, 14 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 23, 41, { 19, 17 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ // 5x5
+ push_back(convolution_random_test_all_params{
+ b, 16, 28, { 14, 14 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ push_back(convolution_random_test_all_params{
+ b, 23, 41, { 19, 17 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+ }
+ return *this;
}
- VVVVF<OutputT> calculate_reference() override {
- VVVVF<OutputT> expected = VVVVF<OutputT>(this->batch_num(), VVVF<OutputT>(this->output_features()));
- for (size_t bi = 0; bi < this->batch_num(); ++bi)
- for (size_t fi = 0; fi < this->output_features(); ++fi) {
- auto bias = this->has_bias() ? this->_bias[fi] : static_cast<OutputT>(0);
- expected[bi][fi] = reference_convolve<InputT, OutputT, WeightsT>(
- this->_input[bi],
- this->_weights[fi],
- this->_stride_y,
- this->_stride_x,
- static_cast<float>(bias),
- this->_dilation_y,
- this->_dilation_x,
- this->_offset_y,
- this->_offset_x,
- 0,
- 0,
- 0,
- 0,
- false,
- _data_zp,
- _weights_zp[fi]);
- }
- return expected;
+ // Convenience: smoke set followed by the extra set, same format/flags for both.
+ params_generator& all_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false) {
+ return smoke_test_params(input_format, asymm_weights, asymm_data)
+ .extra_test_params(input_format, asymm_weights, asymm_data);
}
- void param_set_up(const convolution_random_test_all_params& params) override {
- parent::param_set_up(params);
-
- _data_zp = generate_random_1d<InputT>(this->input_features(), -128, 128);
- _weights_zp = generate_random_1d<WeightsT>(this->output_features(), -128, 128);
+ // Append a single hand-written case, chaining like the bulk generators.
+ params_generator& add(convolution_random_test_all_params params) {
+ push_back(params);
+ return *this;
}
-
-protected:
- VF<InputT> _data_zp;
- VF<WeightsT> _weights_zp;
};
-using convolution_asymm_random_test_s8s8f32 = convolution_asymm_weights_data_random_test<int8_t, int8_t, float>;
-using convolution_asymm_random_test_u8s8f32 = convolution_asymm_weights_data_random_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_asymm_random_test_s8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
+// The harness is a plain object now, so one parameterized fixture can drive
+// several input/weight/output type combinations over the same param list.
+TEST_P(convolution_random_smoke_test, u8s8f32) {
+ convolution_random_test_u8s8f32 test;
+ ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
-TEST_P(convolution_asymm_random_test_u8s8f32, random) {
- ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
+TEST_P(convolution_random_smoke_test, u8s8f32_scale) {
+ convolution_scale_random_test_u8s8f32 test;
+ ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
INSTANTIATE_TEST_CASE_P(
- basic_asymm,
- convolution_asymm_random_test_s8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input features
- testing::Values(16, 32), // output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::bfyx, format::b_fs_yx_fsv32) // input format
+ basic,
+ convolution_random_smoke_test,
+ testing::ValuesIn(
+ params_generator()
+ .smoke_test_params(format::b_fs_yx_fsv4)
+ .smoke_test_params(format::bfyx)
+ .smoke_test_params(format::b_fs_yx_fsv32)
+ .smoke_test_params(format::b_fs_yx_fsv32, true, true)
+ .smoke_test_params(format::b_fs_yx_fsv32, false, true)
+ .smoke_test_params(format::b_fs_yx_fsv32, true, false)
+ .smoke_test_params(format::b_fs_yx_fsv16)
),
- to_string_convolution_random_params<convolution_random_test_params>);
+ to_string_convolution_all_params
+);
+
+// Exhaustive fixture; its instantiation below is registered under a DISABLED_
+// prefix, so gtest only runs these cases when explicitly requested.
+class convolution_random_all_test : public testing::TestWithParam<convolution_random_test_all_params> {};
+
+TEST_P(convolution_random_all_test, u8s8f32) {
+ convolution_random_test_u8s8f32 test;
+ ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
+
+TEST_P(convolution_random_all_test, s8s8f32) {
+ convolution_random_test_s8s8f32 test;
+ ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
+
+TEST_P(convolution_random_all_test, u8s8f32_scale) {
+ convolution_scale_random_test_u8s8f32 test;
+ ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
+
+TEST_P(convolution_random_all_test, s8s8f32_scale) {
+ convolution_scale_random_test_s8s8f32 test;
+ ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
INSTANTIATE_TEST_CASE_P(
- basic_asymm,
- convolution_asymm_random_test_u8s8f32,
- testing::Combine(
- testing::Values(1, 2), // batch
- testing::Values(3, 32), // input features
- testing::Values(16, 32), // output features
- testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
- testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
- testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)), // strides x, y
- testing::Values(std::pair<int, int>(0, 0)), // offsets x, y
- testing::Values(std::pair<int, int>(1, 1)), // dilation x, y
- testing::Values(false, true), // bias
- testing::Values(format::bfyx, format::b_fs_yx_fsv32) // input format
+ DISABLED_basic,
+ convolution_random_all_test,
+ testing::ValuesIn(
+ params_generator()
+ .all_test_params(format::bfyx)
+ .all_test_params(format::bfyx, true, true)
+ .all_test_params(format::bfyx, false, true)
+ .all_test_params(format::bfyx, true, false)
+ .all_test_params(format::b_fs_yx_fsv4)
+ // byxf_af32 - depthwise broken for batch > 1
+ // .smoke_test_params(format::byxf_af32)
+ .all_test_params(format::b_fs_yx_fsv32)
+ .all_test_params(format::b_fs_yx_fsv32, true, true)
+ .all_test_params(format::b_fs_yx_fsv32, false, true)
+ .all_test_params(format::b_fs_yx_fsv32, true, false)
+ .all_test_params(format::b_fs_yx_fsv16)
+ .add(convolution_random_test_all_params{
+ 1, 89, 3, { 1, 1 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, format::b_fs_yx_fsv4, false, false })
),
- to_string_convolution_random_params<convolution_random_test_params>);
+ to_string_convolution_all_params
+);
class convolution_test : public tests::generic_test
{
#include "api/memory.hpp"
#include <api/input_layout.hpp>
#include "api/deconvolution.hpp"
+#include "api/crop.hpp"
#include <api/data.hpp>
#include <api/topology.hpp>
#include <api/network.hpp>
}
+// Cross-checks two builds of the same 9x9/stride-2/pad-4 deconvolution:
+// a reference network with f16 weights/bias and an explicit 32x32 output size,
+// against an "actual" network that uses f32 weights/bias, graph optimizations
+// (optimize_data) and a trailing reorder back to f16.
+TEST(deconvolution_f16_gpu, basic_k9x9_s2x2_pad4x4) {
+ // Filter : 1x32x9x9
+ // Input : 1x32x16x16
+ // Stride : 2x2
+ // Pad : 4x4
+
+ //const auto& engine = get_test_engine();
+ engine engine;
+
+ VVVVF<FLOAT16> input_rnd = generate_random_4d<FLOAT16>(1, 32, 16, 16, -2, 2);
+ VF<FLOAT16> input_rnd_vec = flatten_4d<FLOAT16>(format::bfyx, input_rnd);
+ VVVVF<FLOAT16> filter_rnd = generate_random_4d<FLOAT16>(1, 32, 9, 9, -1, 1);
+ VF<FLOAT16> filter_rnd_vec = flatten_4d<FLOAT16>(format::bfyx, filter_rnd);
+ VF<FLOAT16> bias_rnd = generate_random_1d<FLOAT16>(1, -1, 1);
+ VF<float> filter_rnd_f32_vec, bias_f32_rnd;
+
+ // Duplicate the random f16 weights/bias as f32 for the second network.
+ for (unsigned int i = 0; i < filter_rnd_vec.size(); i++)
+ filter_rnd_f32_vec.push_back(float(filter_rnd_vec[i]));
+
+ for (unsigned int i = 0; i < bias_rnd.size(); i++)
+ bias_f32_rnd.push_back(float(bias_rnd[i]));
+
+ auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 32, 16, 16 } });
+ auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, { 1, 32, 9, 9 } });
+ auto biases = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 1, 1, 1 } });
+ auto weights_f32 = memory::allocate(engine, { data_types::f32, format::oiyx, { 1, 32, 9, 9 } });
+ auto biases_f32 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
+
+ set_values(input, input_rnd_vec);
+ set_values(weights, filter_rnd_vec);
+ set_values(biases, bias_rnd);
+ set_values(weights_f32, filter_rnd_f32_vec);
+ set_values(biases_f32, bias_f32_rnd);
+
+ // Reference: plain f16 deconvolution, output size pinned to 1x1x32x32.
+ topology topology_ref(
+ input_layout("input", input.get_layout()),
+ data("weights", weights),
+ data("biases", biases),
+ deconvolution("deconv", "input", { "weights" }, { "biases" }, { 1, 1, 2, 2 }, { 0, 0, -4, -4 }, tensor{ 1, 1, 32, 32 })
+ );
+
+ network network_ref(engine, topology_ref);
+ network_ref.set_input_data("input", input);
+
+ auto outputs_ref = network_ref.execute();
+ EXPECT_EQ(outputs_ref.size(), size_t(1));
+ EXPECT_EQ(outputs_ref.begin()->first, "deconv");
+ auto output_ref_prim = outputs_ref.begin()->second.get_memory();
+ auto output_ref_ptr = output_ref_prim.pointer<FLOAT16>();
+
+ std::vector<FLOAT16> output_vec_ref;
+ for (unsigned int i = 0; i < output_ref_prim.get_layout().count(); i++)
+ {
+ output_vec_ref.push_back(output_ref_ptr[i]);
+ }
+
+ // Actual: f32 weights/bias, optimized build, result reordered back to f16.
+ topology topology_act(
+ input_layout("input_act", input.get_layout()),
+ data("weights_f32", weights_f32),
+ data("biases_f32", biases_f32),
+ deconvolution("deconv_act", "input_act", { "weights_f32" }, { "biases_f32" }, { 1, 1, 2, 2 }, { 0, 0, -4, -4 }),
+ reorder("out", "deconv_act", format::bfyx, data_types::f16)
+ );
+
+ cldnn::build_options options;
+ options.set_option(cldnn::build_option::optimize_data(true));
+ network network_act(engine, topology_act, options);
+ network_act.set_input_data("input_act", input);
+
+ auto outputs_act = network_act.execute();
+ EXPECT_EQ(outputs_act.size(), size_t(1));
+ EXPECT_EQ(outputs_act.begin()->first, "out");
+ auto output_act_prim = outputs_act.begin()->second.get_memory();
+ auto output_act_ptr = output_act_prim.pointer<FLOAT16>();
+
+ // NOTE(review): output_vec is never used - consider removing.
+ std::vector<float> output_vec;
+ // Loose absolute tolerance (1.0) to absorb f16-vs-f32 accumulation drift
+ // over the 9x9 kernel; NOTE(review): quite permissive - confirm intended.
+ for (unsigned int i = 0; i < output_act_prim.get_layout().count(); i++)
+ {
+ float x = float_round(output_act_ptr[i]), y = float_round(output_vec_ref[i]);
+ EXPECT_NEAR(x, y, 1e-0f);
+ }
+}
+
TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_b_fs_yx_fsv16_stride2_pad1) {
// Filter : 2x2
// Input : 2x2x1x2
#include <api/memory.hpp>
#include <api/depth_to_space.hpp>
#include <api/topology.hpp>
+#include <api/reshape.hpp>
#include <api/network.hpp>
+#include "api/permute.hpp"
+#include "api/reorder.hpp"
#include <cstddef>
#include <tests/test_utils/test_utils.h>
}
}
-TEST(depth_to_space_fp32_gpu, d1421_bs2) {
- // Input : 1x4x2x1
+// NOTE(review): suite name still says fp32 while the data below is f16 -
+// consider whether the test belongs under an fp16 suite name.
+TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
+ // Input : 1x12x960x540
 // Block size : 2
- // Output : 1x1x4x2
- // Input values in fp32
+ // Output : 1x3x1920x1080
+ // Input values in fp16
 engine engine;
- auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 2 } });
+ auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 12, 960, 540 } });
 size_t block_size = 2;
- set_values(input1, {
- 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f
- });
+ auto random_input = generate_random_4d<FLOAT16>(1, 12, 540, 960, -1, 1);
+ auto input_rnd_vec = flatten_4d<FLOAT16>(format::bfyx, random_input);
+ set_values(input1, input_rnd_vec);
- topology topology;
- topology.add(input_layout("Input0", input1.get_layout()));
- topology.add(
+ // Actual network: the depth_to_space primitive under test.
+ topology topology_act;
+ topology_act.add(input_layout("Input0", input1.get_layout()));
+ topology_act.add(
 depth_to_space("depth_to_space", "Input0", block_size)
 );
- network network(engine, topology);
+ network network_act(engine, topology_act);
- network.set_input_data("Input0", input1);
+ network_act.set_input_data("Input0", input1);
- auto outputs = network.execute();
+ auto outputs = network_act.execute();
 auto output = outputs.at("depth_to_space").get_memory();
- auto output_ptr = output.pointer<float>();
+ auto output_ptr = output.pointer<FLOAT16>();
- std::vector<float> expected_results = {
- 0.f, 2.f, 4.f, 6.f, 1.f, 3.f, 5.f, 7.f
- };
+ // Axis order used by the reference permute below.
+ std::vector<uint16_t> perm = { 0,4,5,2,1,3 };
- for (size_t i = 0; i < expected_results.size(); ++i) {
- EXPECT_EQ(expected_results[i], output_ptr[i]);
+ // Reference: emulate depth_to_space(block=2) with a 6D reorder, a reshape
+ // that splits the feature axis, a permute, and a final reshape to
+ // 1x3x1920x1080. Built with optimize_data(true).
+ topology topology_ref;
+ topology_ref.add(input_layout("Input0", input1.get_layout()));
+ topology_ref.add(reorder("reorder1", "Input0", { data_types::f16, format::bfwzyx, tensor{ batch(1), feature(12), spatial(1, 1, 960, 540) }
+ }));
+ topology_ref.add(
+ reshape("reshape", "reorder1", tensor{ batch(1), feature(2), spatial(960, 540, 3, 2) })
+ );
+ topology_ref.add(
+ permute("perm", "reshape", perm)
+ );
+ topology_ref.add(
+ reshape("reshape2", "perm", tensor(1, 3, 2 * 960, 2 * 540))
+ );
+
+ build_options build_opt;
+
+ build_opt.set_option(build_option::optimize_data(true));
+ network network_ref(engine, topology_ref, build_opt);
+ network_ref.set_input_data("Input0", input1);
+
+ auto outputs_ref = network_ref.execute();
+
+ auto output_ref = outputs_ref.at("reshape2").get_memory();
+ auto output_ptr_ref = output_ref.pointer<FLOAT16>();
+
+ // Element-wise exact comparison is safe: both paths only rearrange the
+ // same f16 values, no arithmetic is performed.
+ for (size_t i = 0; i < output.get_layout().count(); ++i) {
+ EXPECT_EQ(output_ptr_ref[i], output_ptr[i]);
 }
}
static std::vector<std::vector<tensor>> inputs = {
{{1, 2, 3, 4}, {1, 2, 3, 4}},
{{1, 16, 8, 2}, {1, 16, 8, 2}},
+ {{1, 128, 16, 8}, {1, 1, 16, 8}},
{{1, 32, 2, 2}, {1, 32, 2, 2}},
{{8, 32, 4, 5}, {8, 32, 4, 5}},
{{1, 2, 3, 4}, {1, 2, 1, 1}},
#include <api/engine.hpp>
#include "test_utils/test_utils.h"
#include <api/data.hpp>
+#include <api/depth_to_space.hpp>
#include <api_extension/fused_conv_eltwise.hpp>
EXPECT_EQ(out_layout.size.spatial[1], 5);
}
+// Builds the chain conv -> depth_to_space -> eltwise twice: once with
+// optimize_data(true) ("act") and once without optimizations but with a
+// trailing reorder to image_2d_rgba/u8 ("ref"), then compares raw bytes.
+// NOTE(review): the act topology has no matching reorder, so its first output
+// ("eltwise", f16) is byte-compared against ref's u8 image output ("out") -
+// verify this is intended; it looks like the act topology should end with the
+// same reorder.
+TEST(fused_conv_eltwise, basic_image2d)
+{
+ const auto& engine = get_test_engine();
+
+ auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 4, 128, 2 } });
+ auto input2 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 3, 256, 4 } });
+ auto weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 12, 4, 1, 1 } });
+
+ auto input_data1 = generate_random_4d<FLOAT16>(1, 4, 2, 128, -1, 1);
+ auto input_data1_bfyx = flatten_4d(format::bfyx, input_data1);
+ set_values(input, input_data1_bfyx);
+
+ auto input_data2 = generate_random_4d<FLOAT16>(1, 3, 4, 256, -1, 1);
+ auto input_data2_bfyx = flatten_4d(format::bfyx, input_data2);
+ set_values(input2, input_data2_bfyx);
+
+ auto weights_data= generate_random_4d<FLOAT16>(12, 4, 1, 1, -1, 1);
+ auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+ set_values(weights, weights_data_bfyx);
+
+ // "Actual" network: optimizations enabled, no explicit output reorder.
+ topology topology_act(
+ input_layout("input", input.get_layout()),
+ input_layout("input2", input2.get_layout()),
+ data("weights", weights),
+ convolution("conv", "input", { "weights" }),
+ depth_to_space("depth_to_space", "conv", 2),
+ eltwise("eltwise", "input2", "depth_to_space", eltwise_mode::sum)
+ );
+
+ build_options opt_act;
+ opt_act.set_option(build_option::optimize_data(true));
+ network network_act(engine, topology_act, opt_act);
+ network_act.set_input_data("input", input);
+ network_act.set_input_data("input2", input2);
+
+ auto outputs_act = network_act.execute();
+ EXPECT_EQ(outputs_act.size(), size_t(1));
+ EXPECT_EQ(outputs_act.begin()->first, "eltwise");
+
+ auto output_act = outputs_act.begin()->second.get_memory();
+ auto&& out_act_layout = output_act.get_layout();
+ auto out_act_ptr = output_act.pointer<uint8_t>();
+
+ // Reference network: same graph plus a reorder to an RGBA image in u8,
+ // built without graph optimizations.
+ topology topology_ref(
+ input_layout("input", input.get_layout()),
+ input_layout("input2", input2.get_layout()),
+ data("weights", weights),
+ convolution("conv", "input", { "weights" }),
+ depth_to_space("depth_to_space", "conv", 2),
+ eltwise("eltwise", "input2", "depth_to_space", eltwise_mode::sum),
+ reorder("out", "eltwise", format::image_2d_rgba, data_types::u8));
+
+ build_options opt_ref;
+ opt_ref.set_option(build_option::optimize_data(false));
+ network network_ref(engine, topology_ref, opt_ref);
+ network_ref.set_input_data("input", input);
+ network_ref.set_input_data("input2", input2);
+
+ auto outputs_ref = network_ref.execute();
+ EXPECT_EQ(outputs_ref.size(), size_t(1));
+ EXPECT_EQ(outputs_ref.begin()->first, "out");
+
+ auto output_ref = outputs_ref.begin()->second.get_memory();
+ auto&& out_ref_layout = output_ref.get_layout();
+ auto out_ref_ptr = output_ref.pointer<uint8_t>();
+
+ // 3 * 256 * 4 = element count of input2 / the eltwise output.
+ for (int i = 0;i < 3 * 256 * 4;i++) {
+ EXPECT_EQ(out_act_ptr[i], out_ref_ptr[i]);
+ }
+}
+
TEST(fused_conv_eltwise, dont_fuse_if_conv_elt_are_outputs)
{
const auto& engine = get_test_engine();
class BaseFusingTest : public ::testing::TestWithParam<T> {
public:
cldnn::engine engine;
- cldnn::topology topology;
+ cldnn::topology topology_fused;
+ cldnn::topology topology_non_fused;
cldnn::build_options bo_fused;
cldnn::build_options bo_not_fused;
size_t count = 0;
for (auto& pi : net.get_primitives_info()) {
if (pi.type_id == "reorder") {
- count++;
+ auto exec_prims = net.get_executed_primitives();
+ auto it = std::find_if(exec_prims.begin(), exec_prims.end(), [&](const std::pair<primitive_id, event>& e) -> bool {
+ return e.first == pi.original_id;
+ });
+ // We count executed reorders only
+ if (it != exec_prims.end())
+ count++;
}
}
return count;
layout get_single_element_layout(T& p) {
return layout{ p.default_type, p.default_format, tensor{1, 1, 1, 1} };
}
+
+ // Add the same primitives to both the fused and non-fused topologies so the
+ // two networks are built from identical graphs; they differ only in the
+ // build options (bo_fused / bo_not_fused) applied at network creation.
+ template <class... Args>
+ void create_topologies(Args const&... args) {
+ topology_fused.add(args...);
+ topology_non_fused.add(args...);
+ }
};
class WeightsPrimitiveFusingTest : public ::BaseFusingTest<bc_test_params> {
void execute(bc_test_params& p) {
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
void execute(resample_test_params& p) {
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
auto input0_prim = get_mem(get_input_layout(p, 0));
auto input1_prim = get_mem(get_input_layout(p, 1));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input0", input0_prim);
network_not_fused.set_input_data("input0", input0_prim);
network_fused.set_input_data("input1", input1_prim);
#define CASE_CONV_U8S8_4 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_5 {1, 16, 5, 5}, {1, 32, 5, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
+#define CASE_CONV_U8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_1 {1, 15, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_2 {1, 15, 5, 5}, {1, 30, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_4 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_5 {1, 16, 5, 5}, {1, 32, 5, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
+#define CASE_CONV_S8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV3D_U8S8_1 {1, 15, 5, 4, 5}, {1, 30, 3, 2, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_CONV3D_U8S8_2 {1, 15, 5, 5, 5}, {1, 30, 3, 3, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
class conv_fp32_activation : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_activation, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
class conv_fp32_scale : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_scale, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
class conv_fp32_prelu_eltwise : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_prelu_eltwise, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("slope_data", get_mem(get_per_channel_layout(p))),
TEST_P(conv_fp32_prelu_eltwise, vector_ops) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("slope_data", get_mem(get_per_channel_layout(p))),
TEST_P(conv_fp32_prelu_eltwise, vector_ops_mixed_types) {
auto p = GetParam();
auto slope_type = p.default_type == data_types::f32 ? data_types::f16 : data_types::f32;
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("slope_data", get_mem(layout{ slope_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} })),
TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_output_layout(p))),
class conv_fp32_swish : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_swish, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
std::vector<std::string> weights_idx;
for (size_t w = 0; w < p.groups; w++) {
- topology.add(data("weights" + std::to_string(w), get_mem(get_weights_layout(p, p.groups))));
+ create_topologies(data("weights" + std::to_string(w), get_mem(get_weights_layout(p, p.groups))));
weights_idx.push_back(("weights" + std::to_string(w)));
}
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", weights_idx, {}, 1, p.stride, p.pad, p.dilation),
eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum),
class conv_fp32_quantize_u8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_quantize_u8, DISABLED_basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_fp32_scale_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_scale_quantize_i8, DISABLED_basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_fp32_scale_activation_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_scale_activation_quantize_i8, DISABLED_basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_fp32_scale_activation_quantize_i8_eltwise_fp32 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_scale_activation_quantize_i8_eltwise_fp32, DISABLED_basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_fp32_scale_activation_quantize_i8_activation : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_scale_activation_quantize_i8_activation, DISABLED_basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, DISABLED_basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_bin_activation : public WeightsPrimitiveFusingTest {};
TEST_P(conv_bin_activation, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
binary_convolution("bin_conv_prim", "input", {"weights"}, p.stride, p.pad, p.dilation, p.out_shape, p.groups),
activation("activation", "bin_conv_prim", activation_func::relu),
class conv_bin_scale_activation : public WeightsPrimitiveFusingTest {};
TEST_P(conv_bin_scale_activation, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
binary_convolution("bin_conv_prim", "input", {"weights"}, p.stride, p.pad, p.dilation, p.out_shape, p.groups),
TEST_P(conv_bin_quantize_bin, channel_wise_quantize) {
auto p = GetParam();
auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("in_lo", in_thresh),
data("in_hi", in_thresh),
TEST_P(conv_bin_quantize_bin, blob_wise_quantize) {
auto p = GetParam();
auto in_thresh = get_mem(get_single_element_layout(p), min_random, max_random);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("in_lo", in_thresh),
data("in_hi", in_thresh),
auto dw_weights_layout = layout{p.default_type, format::goiyx, dw_tensor};
auto dw_stride = tensor{1, 1, 2, 2};
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
auto dw_weights_layout = layout{p.default_type, format::goiyx, dw_tensor};
auto dw_stride = tensor{1, 1, 1, 1};
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
auto dw_stride = tensor{1, 1, 2, 2};
auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
auto dw_stride = tensor{1, 1, 1, 1};
auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p), -127, 127)),
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
class conv_int8_scale : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
class conv_int8_scale_shift_swish : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_shift_swish, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
class conv_int8_byxf_af32 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)),
TEST_P(conv_int8_byxf_af32, per_element_coeffs) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_output_layout(p))),
class conv_int8_prelu_eltwise : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_prelu_eltwise, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("slope_data", get_mem(get_per_channel_layout(p))),
execute(p);
}
+// Same conv -> prelu-style activation -> eltwise chain as the `basic` case,
+// but the fused network is forced to use the b_fs_yx_fsv16 convolution
+// implementation. Only 4D cases run; 5D cases return early (no optimized
+// 5D int8 convolution yet).
+TEST_P(conv_int8_prelu_eltwise, fsv16) {
+ auto p = GetParam();
+ create_topologies(input_layout("input", get_input_layout(p)),
+ data("weights", get_mem(get_weights_layout(p))),
+ data("bias", get_mem(get_bias_layout(p))),
+ data("slope_data", get_mem(get_per_channel_layout(p))),
+ data("eltwise_data", get_mem(get_output_layout(p))),
+ convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+ activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope),
+ eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum),
+ reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32)
+ );
+
+ if (p.default_format.dimension() == 4) {
+ // Empty kernel name = let the engine pick any kernel for that format.
+ implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
+ bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
+ } else {
+ // TODO Add 5D int8 optimized convolution implementations
+ return;
+ }
+
+ tolerance = 1e-5f;
+ execute(p);
+}
+
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_prelu_eltwise,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_U8S8_1, 2, 4},
bc_test_params{CASE_CONV_U8S8_2, 2, 4},
bc_test_params{CASE_CONV_U8S8_3, 2, 4},
bc_test_params{CASE_CONV_U8S8_4, 2, 4},
+ bc_test_params{CASE_CONV_U8S8_7, 2, 4},
bc_test_params{CASE_CONV_S8S8_1, 2, 4},
bc_test_params{CASE_CONV_S8S8_2, 2, 4},
bc_test_params{CASE_CONV_S8S8_3, 2, 4},
bc_test_params{CASE_CONV_S8S8_4, 2, 4},
+ bc_test_params{CASE_CONV_S8S8_7, 2, 4},
bc_test_params{CASE_CONV3D_U8S8_1, 2, 4},
bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},
class conv_int8_quantize_u8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_quantize_u8, per_channel) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
TEST_P(conv_int8_quantize_u8, per_tensor) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
class conv_int8_scale_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_int8_scale_activation_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_activation_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_int8_scale_activation_quantize_i8_eltwise_fp32 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_int8_scale_activation_quantize_i8_activation : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_activation_quantize_i8_activation, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec : public WeightsPrimitiveFusingTest {};
TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops_mixed_types) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx;
auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) :
get_weights_layout(p);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(weights_layout)),
data("bias", get_mem(get_bias_layout(p))),
data("w_zp", get_mem(get_weights_zp_layout(p), 1, 127)),
tolerance = 1.f;
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
ASSERT_FALSE(network_fused.get_primitives_info().empty());
ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
+ // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder
auto find_conv = [](primitive_info& p) -> bool {
- if (p.original_id == "conv_prim")
+ if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx")
return true;
return false;
};
auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx;
auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) :
get_weights_layout(p);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(weights_layout)),
data("bias", get_mem(get_bias_layout(p))),
data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)),
tolerance = 1.f;
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
ASSERT_FALSE(network_fused.get_primitives_info().empty());
ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
+ // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder
auto find_conv = [](primitive_info& p) -> bool {
- if (p.original_id == "conv_prim")
+ if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx")
return true;
return false;
};
auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx;
auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) :
get_weights_layout(p);
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(weights_layout)),
data("bias", get_mem(get_bias_layout(p))),
data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)),
tolerance = 1.f;
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
ASSERT_FALSE(network_fused.get_primitives_info().empty());
ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
+ // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder
auto find_conv = [](primitive_info& p) -> bool {
- if (p.original_id == "conv_prim")
+ if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx")
return true;
return false;
};
class fc_fp32_activation : public WeightsPrimitiveFusingTest {};
TEST_P(fc_fp32_activation, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", "bias"),
class fc_int8_scale : public WeightsPrimitiveFusingTest {};
TEST_P(fc_int8_scale, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
class fc_int8_quantize_u8 : public WeightsPrimitiveFusingTest {};
TEST_P(fc_int8_quantize_u8, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class fc_int8_scale_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(fc_int8_scale_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class fc_int8_scale_activation_quantize_i8 : public WeightsPrimitiveFusingTest {};
TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class gemm_int8_3in_quantize_i8 : public GemmFusingTest {};
TEST_P(gemm_int8_3in_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input0", get_input_layout(p, 0)),
+ create_topologies(input_layout("input0", get_input_layout(p, 0)),
input_layout("input1", get_input_layout(p, 1)),
input_layout("input2", get_input_layout(p, 2)),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
class gemm_int8_2in_quantize_u8 : public GemmFusingTest {};
TEST_P(gemm_int8_2in_quantize_u8, basic) {
auto p = GetParam();
- topology.add(input_layout("input0", get_input_layout(p, 0)),
+ create_topologies(input_layout("input0", get_input_layout(p, 0)),
input_layout("input1", get_input_layout(p, 1)),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
class gemm_int8_2in_act_scale_quantize_i8 : public GemmFusingTest {};
TEST_P(gemm_int8_2in_act_scale_quantize_i8, basic) {
auto p = GetParam();
- topology.add(input_layout("input0", get_input_layout(p, 0)),
+ create_topologies(input_layout("input0", get_input_layout(p, 0)),
input_layout("input1", get_input_layout(p, 1)),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
class resample_quantize : public ResamplePrimitiveFusingTest {};
TEST_P(resample_quantize, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
class resample_scale_activation : public ResamplePrimitiveFusingTest {};
TEST_P(resample_scale_activation, basic) {
auto p = GetParam();
- topology.add(input_layout("input", get_input_layout(p)),
+ create_topologies(input_layout("input", get_input_layout(p)),
data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)),
resample("resample_prim", "input", p.out_shape, p.in_shape.feature[0], p.type),
scale("scale", "resample_prim", "scale_data"),
#define CASE_MVN_3D_F16_2 {2, 16, 8, 8, 8}, data_types::f16, format::bfzyx, true, true, data_types::f16, format::bfzyx
#define CASE_MVN_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, false, true, data_types::f32, format::bfyx
#define CASE_MVN_I8_2 {2, 16, 8, 8}, data_types::i8, format::bfyx, true, true, data_types::f32, format::bfyx
+#define CASE_MVN_I8_3 {1, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx
+#define CASE_MVN_I8_4 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx
#define CASE_MVN_3D_I8_1 {1, 16, 8, 8, 8}, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx
#define CASE_MVN_3D_I8_2 {2, 16, 8, 8, 8}, data_types::i8, format::bfzyx, true, true, data_types::f32, format::bfzyx
+#define CASE_MVN_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, false, true, data_types::f32, format::bfyx
+#define CASE_MVN_U8_2 {2, 16, 8, 8}, data_types::u8, format::bfyx, true, true, data_types::f32, format::bfyx
+#define CASE_MVN_U8_3 {1, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx
+#define CASE_MVN_U8_4 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx
+#define CASE_MVN_3D_U8_1 {1, 16, 8, 8, 8}, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx
+#define CASE_MVN_3D_U8_2 {2, 16, 8, 8, 8}, data_types::u8, format::bfzyx, true, true, data_types::f32, format::bfzyx
class MVNFusingTest : public ::BaseFusingTest<mvn_test_params> {
public:
void execute(mvn_test_params& p) {
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
class mvn_activation : public MVNFusingTest {};
TEST_P(mvn_activation, basic) {
auto p = GetParam();
- topology.add(
+ create_topologies(
input_layout("input", get_input_layout(p)),
mvn("mvn", "input", false, p.normalize_variance),
activation("act", "mvn", activation_func::hyperbolic_tan),
mvn_test_params{ CASE_MVN_3D_F16_2, 2, 3 },
mvn_test_params{ CASE_MVN_I8_1, 2, 3 },
mvn_test_params{ CASE_MVN_I8_2, 2, 3 },
+ mvn_test_params{ CASE_MVN_I8_3, 2, 3 },
+ mvn_test_params{ CASE_MVN_I8_4, 2, 3 },
mvn_test_params{ CASE_MVN_3D_I8_1, 2, 3 },
mvn_test_params{ CASE_MVN_3D_I8_2, 2, 3 },
+ mvn_test_params{ CASE_MVN_U8_1, 2, 3 },
+ mvn_test_params{ CASE_MVN_U8_2, 2, 3 },
+ mvn_test_params{ CASE_MVN_U8_3, 2, 3 },
+ mvn_test_params{ CASE_MVN_U8_4, 2, 3 },
+ mvn_test_params{ CASE_MVN_3D_U8_1, 2, 3 },
+ mvn_test_params{ CASE_MVN_3D_U8_2, 2, 3 },
}), );
class mvn_scale_quantize_i8 : public MVNFusingTest {};
TEST_P(mvn_scale_quantize_i8, basic) {
auto p = GetParam();
- topology.add(
+ create_topologies(
input_layout("input", get_input_layout(p)),
mvn("mvn", "input", false, p.normalize_variance),
data("scale_data", get_mem(get_per_channel_layout(p))),
// mvn_test_params{ CASE_MVN_3D_F16_2, 2, 4 },
mvn_test_params{ CASE_MVN_I8_1, 2, 4 },
mvn_test_params{ CASE_MVN_I8_2, 2, 4 },
+ mvn_test_params{ CASE_MVN_I8_3, 2, 4 },
+ mvn_test_params{ CASE_MVN_I8_4, 2, 4 },
mvn_test_params{ CASE_MVN_3D_I8_1, 2, 4 },
mvn_test_params{ CASE_MVN_3D_I8_2, 2, 4 },
+ mvn_test_params{ CASE_MVN_U8_1, 2, 4 },
+ mvn_test_params{ CASE_MVN_U8_2, 2, 4 },
+ mvn_test_params{ CASE_MVN_U8_3, 2, 4 },
+ mvn_test_params{ CASE_MVN_U8_4, 2, 4 },
+ mvn_test_params{ CASE_MVN_3D_U8_1, 2, 4 },
+ mvn_test_params{ CASE_MVN_3D_U8_2, 2, 4 },
}), );
class mvn_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public MVNFusingTest {};
TEST_P(mvn_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, basic) {
auto p = GetParam();
- topology.add(
+ create_topologies(
input_layout("input", get_input_layout(p)),
mvn("mvn", "input", false, p.normalize_variance),
data("scale_data", get_mem(get_per_channel_layout(p))),
// mvn_test_params{ CASE_MVN_3D_F16_2, 2, 7 },
mvn_test_params{ CASE_MVN_I8_1, 2, 7 },
mvn_test_params{ CASE_MVN_I8_2, 2, 7 },
+ mvn_test_params{ CASE_MVN_I8_3, 2, 7 },
+ mvn_test_params{ CASE_MVN_I8_4, 2, 7 },
mvn_test_params{ CASE_MVN_3D_I8_1, 2, 7 },
mvn_test_params{ CASE_MVN_3D_I8_2, 2, 7 },
+ mvn_test_params{ CASE_MVN_U8_1, 2, 7 },
+ mvn_test_params{ CASE_MVN_U8_2, 2, 7 },
+ mvn_test_params{ CASE_MVN_U8_3, 2, 7 },
+ mvn_test_params{ CASE_MVN_U8_4, 2, 7 },
+ mvn_test_params{ CASE_MVN_3D_U8_1, 2, 7 },
+ mvn_test_params{ CASE_MVN_3D_U8_2, 2, 7 },
}), );
void execute(pooling_test_params& p) {
auto input_prim = get_mem(get_input_layout(p));
- network network_not_fused(this->engine, this->topology, bo_not_fused);
- network network_fused(this->engine, this->topology, bo_fused);
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
class pooling_activation : public PoolingFusingTest {};
TEST_P(pooling_activation, basic) {
auto p = GetParam();
- topology.add(
+ create_topologies(
input_layout("input", get_input_layout(p)),
pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset),
activation("act", "pooling", activation_func::relu),
class pooling_scale : public PoolingFusingTest {};
TEST_P(pooling_scale, basic) {
auto p = GetParam();
- topology.add(
+ create_topologies(
input_layout("input", get_input_layout(p)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel_size.count())),
pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset),
};
template <typename T>
-void mvn_compute_mean_accross_channels_bfyx(cldnn::memory &output, bool normalize_variance)
-{
- using namespace tests;
-
- const auto output_desc = generic_test::get_linear_memory_desc(output.get_layout());
+void mvn_compute_mean_accross_channels(cldnn::memory &output, bool normalize_variance) {
+ auto output_size = output.get_layout().size;
- auto output_sizes = output.get_layout().size.sizes();
-
- uint32_t batch_size = output_sizes[0];
- uint32_t feature_size = output_sizes[1];
- uint32_t y_size = output_sizes[3];
- uint32_t x_size = output_sizes[2];
+ uint32_t batch_size = output_size.batch[0];
+ uint32_t feature_size = output_size.feature[0];
+ uint32_t y_size = output_size.spatial[1];
+ uint32_t x_size = output_size.spatial[0];
auto buff = output.pointer<T>();
{
for (uint32_t x = 0; x < x_size; ++x)
{
- size_t data_index = generic_test::get_linear_index(output.get_layout(), b, f, y, x, output_desc);
+ auto index_tensor = tensor(batch(b), feature(f), spatial(x, y, 0, 0));
+ size_t data_index = output.get_layout().get_linear_offset(index_tensor);
float data = static_cast<float>(buff[data_index]);
sum += data;
if (normalize_variance)
}
sum /= feature_size * y_size * x_size;
T result_sum = static_cast<T>(sum);
- EXPECT_NEAR(result_sum, 0.f, err_margin);
+ EXPECT_NEAR(result_sum, 0.f, err_margin) << "at b=" << b;
if (normalize_variance)
{
variance /= feature_size * y_size * x_size;
T result_variance = static_cast<T>(variance);
- EXPECT_NEAR(result_variance, 1.f, err_margin);
+ EXPECT_NEAR(result_variance, 1.f, err_margin) << " at b=" << b;
}
}
}
template <typename T>
-void mvn_compute_mean_within_channels_bfyx(cldnn::memory &output, bool normalize_variance)
-{
- using namespace tests;
+void mvn_compute_mean_within_channels(cldnn::memory &output, bool normalize_variance) {
+ auto output_size = output.get_layout().size;
- const auto output_desc = generic_test::get_linear_memory_desc(output.get_layout());
-
- auto output_sizes = output.get_layout().size.sizes();
-
- uint32_t batch_size = output_sizes[0];
- uint32_t feature_size = output_sizes[1];
- uint32_t y_size = output_sizes[3];
- uint32_t x_size = output_sizes[2];
+ uint32_t batch_size = output_size.batch[0];
+ uint32_t feature_size = output_size.feature[0];
+ uint32_t y_size = output_size.spatial[1];
+ uint32_t x_size = output_size.spatial[0];
auto buff = output.pointer<T>();
{
for (uint32_t x = 0; x < x_size; ++x)
{
- size_t data_index = generic_test::get_linear_index(output.get_layout(), b, f, y, x, output_desc);
+ auto index_tensor = tensor(batch(b), feature(f), spatial(x, y, 0, 0));
+ size_t data_index = output.get_layout().get_linear_offset(index_tensor);
float data = static_cast<float>(buff[data_index]);
sum += data;
if (normalize_variance)
}
sum /= y_size * x_size;
T result_sum = static_cast<T>(sum);
- EXPECT_NEAR(result_sum, 0.f, err_margin);
+ EXPECT_NEAR(result_sum, 0.f, err_margin) << "at b=" << b << ", f=" << f;
if (normalize_variance)
{
variance /= y_size * x_size;
T result_variance = static_cast<T>(variance);
- EXPECT_NEAR(result_variance, 1.f, err_margin);
+ EXPECT_NEAR(result_variance, 1.f, err_margin) << " at b=" << b << ", f=" << f;
}
}
}
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_accross_channels_bfyx<float>(output, false);
+ mvn_compute_mean_accross_channels<float>(output, false);
}
TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_fp16)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_accross_channels_bfyx<FLOAT16>(output, false);
+ mvn_compute_mean_accross_channels<FLOAT16>(output, false);
}
TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_accross_channels_bfyx<float>(output, true);
+ mvn_compute_mean_accross_channels<float>(output, true);
}
TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance_fp16)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_accross_channels_bfyx<FLOAT16>(output, true);
+ mvn_compute_mean_accross_channels<FLOAT16>(output, true);
}
TEST(mvn_gpu_test, mvn_test_within_channels_bfyx)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_within_channels_bfyx<float>(output, false);
+ mvn_compute_mean_within_channels<float>(output, false);
}
TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_fp16)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_within_channels_bfyx<FLOAT16>(output, false);
+ mvn_compute_mean_within_channels<FLOAT16>(output, false);
}
TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_within_channels_bfyx<float>(output, true);
+ mvn_compute_mean_within_channels<float>(output, true);
}
TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance_fp16)
EXPECT_EQ(outputs.begin()->first, "mvn");
auto output = outputs.begin()->second.get_memory();
- mvn_compute_mean_within_channels_bfyx<FLOAT16>(output, true);
+ mvn_compute_mean_within_channels<FLOAT16>(output, true);
}
+
+// Parameter pack for the randomized MVN tests below: describes the input
+// memory and which MVN variant to run.
+struct mvn_basic_test_params {
+    format::type input_format;   // memory format of the input (e.g. b_fs_yx_fsv16)
+    data_types input_type;       // input element type (f32/f16/i8/u8)
+    tensor input_size;
+    bool across_channels;        // normalize across channels vs. within each channel
+    bool normalize_variance;     // also normalize variance, not only the mean
+    padding output_pad;          // extra output padding applied to the mvn primitive
+};
+
+// Randomized MVN test fixture: builds a one-primitive network from
+// mvn_basic_test_params, feeds it random data and checks the output
+// statistics (zero mean, optionally unit variance).
+struct mvn_random_test : ::testing::TestWithParam<mvn_basic_test_params> {
+    // Copies pre-generated 4D data into `mem`, translating (b, f, x, y)
+    // coordinates through the layout, so any memory format works.
+    // NOTE(review): data is indexed [b][f][x][y] here, which matches the
+    // (spatial[0], spatial[1]) argument order used by fill_random_data below.
+    template <typename T>
+    void fill_data(memory& mem, const tests::VVVVF<T>& data) {
+        auto size = mem.get_layout().size;
+        auto ptr = mem.pointer<T>();
+        for (size_t bi = 0; bi < static_cast<size_t>(size.batch[0]); ++bi) {
+            for (size_t fi = 0; fi < static_cast<size_t>(size.feature[0]); ++fi) {
+                for (size_t yi = 0; yi < static_cast<size_t>(size.spatial[1]); ++yi) {
+                    for (size_t xi = 0; xi < static_cast<size_t>(size.spatial[0]); ++xi) {
+                        auto tensor_addr = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        auto offset = mem.get_layout().get_linear_offset(tensor_addr);
+                        ptr[offset] = data[bi][fi][xi][yi];
+                    }
+                }
+            }
+        }
+    }
+
+    // Fills `mem` with uniformly random values in [min, max] (k controls the
+    // value granularity used by generate_random_4d).
+    template <typename T>
+    void fill_random_data(memory& mem, int min, int max, int k = 8) {
+        auto size = mem.get_layout().size;
+        auto input_data = tests::generate_random_4d<T>(size.batch[0], size.feature[0], size.spatial[0], size.spatial[1], min, max, k);
+        fill_data(mem, input_data);
+    }
+
+    // Verifies the MVN post-conditions for floating-point outputs; other
+    // output types are silently accepted without checks.
+    void check_result(memory& output, bool across_channels, bool normalize_variance) {
+        if (output.get_layout().data_type == data_types::f32) {
+            if (across_channels) {
+                mvn_compute_mean_accross_channels<float>(output, normalize_variance);
+            } else {
+                mvn_compute_mean_within_channels<float>(output, normalize_variance);
+            }
+        } else if (output.get_layout().data_type == data_types::f16) {
+            if (across_channels) {
+                mvn_compute_mean_accross_channels<FLOAT16>(output, normalize_variance);
+            } else {
+                mvn_compute_mean_within_channels<FLOAT16>(output, normalize_variance);
+            }
+        }
+    }
+
+    // Builds input -> mvn (with requested output padding), executes it on
+    // random data and validates the single "mvn" output.
+    void execute(const mvn_basic_test_params& params, const engine& eng) {
+        auto& size = params.input_size;
+        auto& output_pad = params.output_pad;
+
+        auto input = memory::allocate(eng, { params.input_type, params.input_format, size });
+
+        // NOTE(review): the u8 case also uses a -127 lower bound; presumably
+        // generate_random_4d clamps/converts before the uint8_t cast — confirm.
+        switch (params.input_type) {
+        case data_types::f32:
+            fill_random_data<float>(input, -127, 127);
+            break;
+        case data_types::f16:
+            fill_random_data<FLOAT16>(input, -127, 127);
+            break;
+        case data_types::i8:
+            fill_random_data<int8_t>(input, -127, 127);
+            break;
+        case data_types::u8:
+            fill_random_data<uint8_t>(input, -127, 127);
+            break;
+        default:
+            break;
+        }
+
+        topology topo;
+        topo.add(input_layout("input", input.get_layout()));
+        auto prim = mvn("mvn", "input", params.across_channels, params.normalize_variance);
+        prim.output_padding = output_pad;
+        topo.add(prim);
+
+        network net(eng, topo);
+
+        net.set_input_data("input", input);
+
+        auto outputs = net.execute();
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "mvn");
+
+        auto output = outputs.begin()->second.get_memory();
+        check_result(output, params.across_channels, params.normalize_variance);
+    }
+};
+
+// Runs one parametrized MVN case on the shared test engine.
+TEST_P(mvn_random_test, random) {
+    auto eng = tests::get_test_engine();
+    this->execute(GetParam(), eng);
+}
+
+// Fluent builder for MVN test-case lists; each helper appends all four
+// combinations of across_channels x normalize_variance for one format/type.
+struct mvn_test_case_generator : std::vector<mvn_basic_test_params> {
+    mvn_test_case_generator& add(mvn_basic_test_params params) {
+        push_back(params);
+        return *this;
+    }
+
+    // Small shapes, no output padding — quick sanity coverage.
+    mvn_test_case_generator& smoke_tests(format::type fmt, data_types in_dt) {
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, false, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, false, true, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, true, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, true, true, padding() });
+        return *this;
+    }
+
+    // Larger shapes, repeated with and without output padding.
+    mvn_test_case_generator& extended_tests(format::type fmt, data_types in_dt) {
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, true, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, true, padding() });
+        // output padding
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, false, padding({0, 0, 1, 1}) });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, true, padding({0, 0, 1, 1}) });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, false, padding({0, 0, 1, 1}) });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, true, padding({0, 0, 1, 1}) });
+
+        return *this;
+    }
+};
+
+// Instantiate the randomized MVN suite for blocked layouts:
+// "smoke" = small shapes, "extended" = larger shapes + output padding.
+INSTANTIATE_TEST_CASE_P(smoke,
+                        mvn_random_test,
+                        testing::ValuesIn(
+                            mvn_test_case_generator()
+                            .smoke_tests(format::b_fs_yx_fsv16, data_types::i8)
+                            .smoke_tests(format::b_fs_yx_fsv16, data_types::u8)
+                        ), );
+
+INSTANTIATE_TEST_CASE_P(extended,
+                        mvn_random_test,
+                        testing::ValuesIn(
+                            mvn_test_case_generator()
+                            .extended_tests(format::b_fs_yx_fsv16, data_types::i8)
+                            .extended_tests(format::b_fs_yx_fsv16, data_types::u8)
+                        ), );
EXPECT_FLOAT_EQ(out_ptr[1], -0.02f);
}
-TEST(reorder_gpu_opt, basic_do_not_remove_redundant_due_it_is_output)
-{
- engine eng;
-
- memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
- memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
- topology tpl{
- input_layout("in", in.get_layout()),
- convolution("conv", "in", { "weights" }),
- data("weights", weights),
- reorder("r1", "conv", format::bfyx, data_types::f32) //reoder is output - do not optimize
- };
-
- build_options opts;
- opts.set_option(build_option::optimize_data(true));
-
- network net(eng, tpl, opts);
- net.set_input_data("in", in);
- auto outputs = net.execute();
- auto executed_primitives = net.get_executed_primitives();
-
- //all pirmitives in this test needs to be executed
- EXPECT_TRUE(executed_primitives.count("conv") == 1);
- EXPECT_TRUE(executed_primitives.count("in") == 1);
- EXPECT_TRUE(executed_primitives.count("r1") == 1);
- ASSERT_TRUE(outputs.count("r1") == 1);
- EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
-}
-
TEST(reorder_gpu_opt, basic_remove_redundant_output_due_to_implicit_reorders)
{
engine eng;
}
}
+// Reorder from interleaved image_2d_rgba (u8) to planar bfyx (f16).
+// Expects values scaled into [0,1] (divided by 255); the alpha channel is
+// dropped since the output has only 3 features.
+TEST(reorder_image2d_rgba_to_bfyx_gpu, basic)
+{
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::u8, format::image_2d_rgba, { 1, 3, 2, 2 } });
+    layout output_layout(data_types::f16, format::bfyx, { 1, 3, 2, 2 });
+
+    // One RGBA pixel per row: R, G, B, A.
+    set_values<unsigned char>(input, {
+        1, 0, 5, 7,
+        2, 111, 123, 8,
+        124, 125, 50, 9,
+        251, 252, 253, 210
+    });
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "reorder");
+
+    auto output = outputs.begin()->second.get_memory();
+
+    // Planar layout: R plane, then G plane, then B plane (alpha discarded).
+    float answers[12] = {
+        1.0f, 2.0f,
+        124.0f, 251.0f,
+
+        0.0f, 111.0f,
+        125.0f, 252.0f,
+
+        5.0f, 123.0f,
+        50.0f, 253.0f,
+    };
+
+    auto output_ptr = output.pointer<FLOAT16>();
+    for (int i = 0; i < 12; i++)
+    {
+        EXPECT_NEAR(FLOAT16(answers[i] / 255.f), output_ptr[i], 1e-3f);
+    }
+
+}
+
+// Inverse of the test above: planar bfyx (f16, values in [0,1]) back to
+// interleaved image_2d_rgba (u8). With 3 input features the alpha channel
+// is expected to be written as 0.
+TEST(reorder_bfyx_to_image2d_rgba_gpu, basic)
+{
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 3, 2, 2 } });
+    layout output_layout(data_types::u8, format::image_2d_rgba, { 1, 3, 2, 2 });
+
+    // R plane, then G plane, then B plane, pre-scaled by 1/255.
+    set_values<FLOAT16>(input, {
+        FLOAT16(1.0f / 255.f), FLOAT16(2.0f / 255.f),
+        FLOAT16(124.0f / 255.f), FLOAT16(251.0f / 255.f),
+
+        FLOAT16(0.0f / 255.f), FLOAT16(111.0f / 255.f),
+        FLOAT16(125.0f / 255.f), FLOAT16(252.0f / 255.f),
+
+        FLOAT16(5.0f / 255.f), FLOAT16(123.0f / 255.f),
+        FLOAT16(50.0f / 255.f), FLOAT16(253.0f / 255.f),
+    });
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "reorder");
+
+    auto output = outputs.begin()->second.get_memory();
+
+    // One RGBA pixel per row; alpha has no source plane, so it must be 0.
+    unsigned char answers[16] = {
+        1, 0, 5, 0,
+        2, 111, 123, 0,
+        124, 125, 50, 0,
+        251, 252, 253, 0
+    };
+
+    auto output_ptr = output.pointer<unsigned char>();
+    for (int i = 0; i < 16; i++)
+    {
+        EXPECT_EQ(answers[i], output_ptr[i]);
+    }
+
+}
+
using namespace cldnn;
class reorder_test : public tests::generic_test
}
}
}
+
+struct resample_random_test_params {
+ data_types input_type;
+ tensor input_size;
+ tensor output_size;
+ uint32_t num_filter;
+ resample_type operation_type;
+ format::type in_format;
+ format::type out_format;
+};
+
+struct resample_random_test : testing::TestWithParam<resample_random_test_params>{
+ template <typename T>
+ void fill_random_typed(memory& mem, int min, int max) {
+ auto size = mem.get_layout().size;
+ size_t b = size.batch[0];
+ size_t f = size.feature[0];
+ size_t x = size.spatial[0];
+ size_t y = size.spatial[1];
+
+ auto data = generate_random_4d<T>(b, f, y, x, min, max);
+ auto ptr = mem.pointer<T>();
+ for (size_t bi = 0; bi < b; ++bi) {
+ for (size_t fi = 0; fi < f; ++fi) {
+ for (size_t yi = 0; yi < y; ++yi) {
+ for (size_t xi = 0; xi < x; ++xi) {
+ auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+ auto offset = mem.get_layout().get_linear_offset(coords);
+ ptr[offset] = data[bi][fi][yi][xi];
+ }
+ }
+ }
+ }
+ }
+
+ void fill_random(memory& mem) {
+ auto dt = mem.get_layout().data_type;
+ switch (dt) {
+ case data_types::f32:
+ fill_random_typed<float>(mem, -127, 127);
+ break;
+ case data_types::f16:
+ fill_random_typed<FLOAT16>(mem, -127, 127);
+ break;
+ case data_types::i8:
+ fill_random_typed<int8_t>(mem, -127, 127);
+ break;
+ case data_types::u8:
+ fill_random_typed<uint8_t>(mem, 0, 255);
+ break;
+ default:
+ break;
+ }
+ }
+
+ template <typename T>
+ void compare_nearest_typed(const memory& input, const memory& output) {
+ auto output_lay = output.get_layout();
+ size_t b = output_lay.size.batch[0];
+ size_t f = output_lay.size.feature[0];
+ size_t x = output_lay.size.spatial[0];
+ size_t y = output_lay.size.spatial[1];
+ float x_ratio = static_cast<float>(input.get_layout().size.spatial[0]) / static_cast<float>(x);
+ float y_ratio = static_cast<float>(input.get_layout().size.spatial[1]) / static_cast<float>(y);
+
+ auto in_ptr = input.pointer<T>();
+ auto out_ptr = output.pointer<T>();
+ for (size_t bi = 0; bi < b; ++bi) {
+ for (size_t fi = 0; fi < f; ++fi) {
+ for (size_t yi = 0; yi < y; ++yi) {
+ for (size_t xi = 0; xi < x; ++xi) {
+ auto in_xi = static_cast<size_t>(floor(x_ratio * xi));
+ auto in_yi = static_cast<size_t>(floor(y_ratio * yi));
+ auto in_coords = tensor(batch(bi), feature(fi), spatial(in_xi, in_yi, 0, 0));
+ auto in_offset = input.get_layout().get_linear_offset(in_coords);
+ auto in_val = in_ptr[in_offset];
+ auto out_coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+ auto out_offset = output.get_layout().get_linear_offset(out_coords);
+ auto out_val = out_ptr[out_offset];
+ EXPECT_EQ(in_val, out_val) << " at bi=" << bi << ", fi=" << fi << ", xi=" << xi << ", yi=" << yi;
+ }
+ }
+ }
+ }
+ }
+
+ void compare(const memory& input, const memory& output, resample_type operation) {
+ auto dt = output.get_layout().data_type;
+ if (operation == resample_type::nearest) {
+ if (dt == data_types::f32) {
+ compare_nearest_typed<float>(input, output);
+ } else if (dt == data_types::f16) {
+ compare_nearest_typed<FLOAT16>(input, output);
+ } else if (dt == data_types::i8) {
+ compare_nearest_typed<int8_t>(input, output);
+ } else if (dt == data_types::u8) {
+ compare_nearest_typed<uint8_t>(input, output);
+ } else {
+ FAIL() << "Not supported data type: " << static_cast<size_t>(dt);
+ }
+ } else {
+ FAIL() << "Not supported resample_type: " << static_cast<int32_t>(operation);
+ }
+ }
+
+ void execute(const resample_random_test_params& params) {
+ auto eng = get_test_engine();
+
+ auto in_layout = layout(params.input_type, params.in_format, params.input_size);
+
+ auto topo = topology(
+ input_layout("in", in_layout),
+ resample("resample", "in", params.output_size, params.num_filter, params.operation_type)
+ );
+
+ auto build_opts = build_options(
+ build_option::force_implementations({ {"resample", {params.out_format, ""}} })
+ );
+ auto net = network(eng, topo, build_opts);
+
+ auto in_mem = memory::allocate(eng, in_layout);
+ fill_random(in_mem);
+ net.set_input_data("in", in_mem);
+
+ auto result = net.execute();
+ auto output = result.at("resample").get_memory();
+
+ compare(in_mem, output, params.operation_type);
+ }
+};
+
+TEST_P(resample_random_test, random) {
+ execute(GetParam());
+}
+
+struct resample_random_test_param_generator : std::vector<resample_random_test_params> {
+ resample_random_test_param_generator& add(resample_random_test_params params) {
+ push_back(params);
+ return *this;
+ }
+
+ resample_random_test_param_generator& smoke_params(data_types type, format::type input_format, format::type output_format) {
+ push_back(resample_random_test_params{ type, {1, 17, 5, 9}, {1, 17, 15, 18}, 1, resample_type::nearest, input_format, output_format });
+ push_back(resample_random_test_params{ type, {2, 17, 5, 9}, {2, 17, 15, 18}, 1, resample_type::nearest, input_format, output_format });
+ push_back(resample_random_test_params{ type, {1, 7, 10, 17}, {1, 7, 21, 35}, 1, resample_type::nearest, input_format, output_format });
+ push_back(resample_random_test_params{ type, {2, 7, 10, 17}, {2, 7, 21, 35}, 1, resample_type::nearest, input_format, output_format });
+ return *this;
+ }
+
+};
+
+INSTANTIATE_TEST_CASE_P(smoke,
+ resample_random_test,
+ testing::ValuesIn(
+ resample_random_test_param_generator()
+ .smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32)
+ .smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32)
+ .smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
+ .smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
+ .smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
+ .smoke_params(data_types::u8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
+ ), );
EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
}
}
+
+TEST(strided_slice_gpu_f32, test_2x2x2x2_full_negative_stride) {
+ // Input (BFYX): 2x2x2x2
+ // Begin (BFYX): 0x0x0x0
+ // End (BFYX): 2x2x2x2
+ // Stride (BFYX): -1x-1x1x1
+ // Output (BFYX): 2x2x2x2
+
+ const auto& engine = get_test_engine();
+ auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } });
+ auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } });
+ auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } });
+ auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } });
+
+ set_values(input, {
+ 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
+ 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
+ });
+ set_values(begin, {
+ 0, 0, 0, 0
+ });
+ set_values(end, {
+ 2, 2, 2, 2
+ });
+ set_values(strides, {
+ -1, -1, 1, 1
+ });
+
+ topology topology;
+ topology.add(input_layout("input", input.get_layout()));
+ topology.add(data("input2", begin));
+ topology.add(data("input3", end));
+ topology.add(data("input4", strides));
+ topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {}));
+
+ network network(engine, topology);
+
+ network.set_input_data("input", input);
+
+ auto outputs = network.execute();
+
+ EXPECT_EQ(outputs.size(), size_t(1));
+ EXPECT_EQ(outputs.begin()->first, "strided_slice");
+
+ auto output = outputs.at("strided_slice").get_memory();
+
+ std::vector<float> answers = {
+ 12.f, 13.f, 14.f, 15.f, 8.f, 9.f, 10.f, 11.f, 4.f, 5.f, 6.f, 7.f, 0.f, 1.f, 2.f, 3.f };
+
+ auto output_ptr = output.pointer<float>();
+
+ ASSERT_EQ(output_ptr.size(), answers.size());
+ for (size_t i = 0; i < answers.size(); ++i)
+ {
+ EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
+ }
+}
+
+TEST(strided_slice_gpu_f32, test_2x2x2x1x1_2_negative_all) {
+ // Input (BFZYX): 2x2x2x1x1
+ // Output (BFZYX): 2x1x1x1x1
+
+ const auto& engine = get_test_engine();
+ auto input = memory::allocate(engine, { data_types::f32, format::bfzyx, { 2, 2, 1, 1, 2 } });
+ auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } });
+ auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } });
+ auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } });
+
+ set_values(input, {
+ 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f
+ });
+ set_values(begin, {
+ 0, 0, 0
+ });
+ set_values(end, {
+ 2, 2, 2
+ });
+ set_values(strides, {
+ 1, 2, 2
+ });
+
+ topology topology;
+ topology.add(input_layout("input", input.get_layout()));
+ topology.add(data("input2", begin));
+ topology.add(data("input3", end));
+ topology.add(data("input4", strides));
+ topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}));
+
+ network network(engine, topology);
+
+ network.set_input_data("input", input);
+
+ auto outputs = network.execute();
+
+ EXPECT_EQ(outputs.size(), size_t(1));
+ EXPECT_EQ(outputs.begin()->first, "strided_slice");
+
+ auto output = outputs.at("strided_slice").get_memory();
+
+ std::vector<float> answers = {
+ 0.0f, 4.0f
+ };
+
+ auto output_ptr = output.pointer<float>();
+
+ for (size_t i = 0; i < answers.size(); ++i)
+ {
+ EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
+ }
+}
INSTANCE(ref_convolution_bwd_data_t<f32, f32, f32, f32>),
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_convolution_bwd_weights_t<f32, f32, f32, f32>),
+#endif
/* conv (bfloat16) */
INSTANCE(_jit_uni_dw_convolution_fwd_t<avx512_core, bf16, bf16>),
INSTANCE(_jit_uni_dw_convolution_fwd_t<avx512_core, bf16, f32>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(_jit_uni_dw_convolution_bwd_data_t<avx512_core, bf16, bf16>),
INSTANCE(_jit_uni_dw_convolution_bwd_data_t<avx512_core, bf16, f32>),
INSTANCE(_jit_uni_dw_convolution_bwd_weights_t<avx512_core, bf16, bf16>),
INSTANCE(_jit_uni_dw_convolution_bwd_weights_t<avx512_core, bf16, f32>),
+#endif
INSTANCE(jit_avx512_core_bf16_1x1_convolution_fwd_t<f32>),
INSTANCE(jit_avx512_core_bf16_1x1_convolution_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_data_t<f32>),
INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_data_t<bf16>),
INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t<f32>),
INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t<bf16>),
+#endif
INSTANCE(jit_avx512_core_bf16_convolution_fwd_t<f32>),
INSTANCE(jit_avx512_core_bf16_convolution_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_avx512_core_bf16_convolution_bwd_data_t<f32>),
INSTANCE(jit_avx512_core_bf16_convolution_bwd_data_t<bf16>),
INSTANCE(jit_avx512_core_bf16_convolution_bwd_weights_t<bf16>),
INSTANCE(jit_avx512_core_bf16_convolution_bwd_weights_t<f32>),
+#endif
INSTANCE(gemm_bf16_convolution_fwd_t<f32>),
INSTANCE(gemm_bf16_convolution_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(gemm_bf16_convolution_bwd_data_t<f32>),
INSTANCE(gemm_bf16_convolution_bwd_data_t<bf16>),
INSTANCE(gemm_bf16_convolution_bwd_weights_t<f32>),
#endif
/* eltwise */
INSTANCE(jit_uni_eltwise_fwd_t<avx512_common, f32>),
-#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_uni_eltwise_fwd_t<avx512_common, bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_uni_eltwise_bwd_t<avx512_common, f32>),
INSTANCE(jit_uni_eltwise_bwd_t<avx512_common, bf16>),
#endif
#endif
INSTANCE(ref_eltwise_fwd_t<f32>),
-#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_eltwise_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_eltwise_bwd_t<f32>),
INSTANCE(ref_eltwise_bwd_t<bf16>),
#endif
INSTANCE(ref_softmax_fwd_t<f32>),
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_softmax_bwd_t<f32>),
+#endif
/* pool */
INSTANCE(jit_uni_pooling_fwd_t<avx512_common, bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_uni_pooling_bwd_t<avx512_common, bf16>),
#endif
INSTANCE(jit_uni_pooling_fwd_t<avx512_common, f32>),
INSTANCE(jit_uni_pooling_fwd_t<sse42, f32>),
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_uni_pooling_bwd_t<sse42, f32>),
+#endif
INSTANCE(nchw_pooling_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(nchw_pooling_bwd_t<bf16>),
#endif
INSTANCE(nchw_pooling_fwd_t<f32>),
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(nchw_pooling_bwd_t<f32>),
-
+#endif
INSTANCE(nhwc_pooling_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(nhwc_pooling_bwd_t<bf16>),
#endif
INSTANCE(nhwc_pooling_fwd_t<f32>),
#endif
INSTANCE(ref_pooling_fwd_t<f32, f32>),
-#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_pooling_fwd_t<bf16, bf16, f32>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_pooling_bwd_t<f32, f32>),
INSTANCE(ref_pooling_bwd_t<bf16, bf16>),
#endif
INSTANCE(jit_avx512_common_lrn_fwd_t<f32>),
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_avx512_common_lrn_bwd_t<f32>),
+#endif
INSTANCE(jit_avx512_common_lrn_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(jit_avx512_common_lrn_bwd_t<bf16>),
#endif
INSTANCE(jit_uni_lrn_fwd_t<avx2>),
INSTANCE(ref_lrn_fwd_t<f32>),
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_lrn_bwd_t<f32>),
+#endif
INSTANCE(ref_lrn_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_lrn_bwd_t<bf16>),
#endif
/* batch normalization */
#ifdef ENABLE_UNUSED_PRIM
INSTANCE(ref_inner_product_bwd_data_t<f32, f32, f32, f32>),
INSTANCE(ref_inner_product_bwd_weights_t<f32>),
+#endif
/* inner product (bfloat16) */
INSTANCE(gemm_bf16_inner_product_fwd_t<f32>),
INSTANCE(gemm_bf16_inner_product_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
INSTANCE(gemm_bf16_inner_product_bwd_data_t<f32>),
INSTANCE(gemm_bf16_inner_product_bwd_data_t<bf16>),
INSTANCE(gemm_bf16_inner_product_bwd_weights_t<f32>),
balance211(work_amount, nthr, ithr, start, end);
auto par_conv = jit_conv_call_s();
- size_t src_h_stride = src_d.blk_off(0, 0, 1);
- size_t dst_h_stride = dst_d.blk_off(0, 0, 1);
+ size_t src_h_stride = src_d.blk_off(0, 0, 1) - src_d.off_l(0);
+ size_t dst_h_stride = dst_d.blk_off(0, 0, 1) - dst_d.off_l(0);
size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
int n{0}, g{0}, occ{0}, oh_s{0}, owb{0};
balance211(work_amount, nthr, ithr, start, end);
auto par_conv = jit_conv_call_s();
- size_t src_d_stride = src_d.blk_off(0, 0, 1);
- size_t src_h_stride = src_d.blk_off(0, 0, 0, 1);
- size_t dst_h_stride = dst_d.blk_off(0, 0, 0, 1);
+ size_t src_d_stride = src_d.blk_off(0, 0, 1) - src_d.off_l(0);
+ size_t src_h_stride = src_d.blk_off(0, 0, 0, 1) - src_d.off_l(0);
+ size_t dst_h_stride = dst_d.blk_off(0, 0, 0, 1) - dst_d.off_l(0);
size_t wht_d_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 0, 1);
}
assert(kd_len >= 0);
- auto diff_src_w = diff_src +
+ auto diff_src_w = diff_src +
diff_src_d.blk_off(n, g_icb, id_s);
auto diff_dst_w = diff_dst + diff_dst_d.blk_off(n, g_ocb, od_s);
auto wht_w = weights + wht_blk_off(weights_d, g, 0, icb, kd_lo);
balance211(work_amount, nthr, ithr, start, end);
auto par_conv = jit_conv_call_s();
- size_t diff_src_h_stride = diff_src_d.blk_off(0, 0, 1);
- size_t diff_dst_h_stride = diff_dst_d.blk_off(0, 0, 1);
+ size_t diff_src_h_stride = diff_src_d.blk_off(0, 0, 1) - diff_src_d.off_l(0);
+ size_t diff_dst_h_stride = diff_dst_d.blk_off(0, 0, 1) - diff_dst_d.off_l(0);
size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
bool is_fast_path = jcp.dilate_h == 0 && jcp.stride_h == 1;
int work_rem = end - start;
int ih_e = ih_s + work_rem > jcp.ih ? jcp.ih : ih_s + work_rem;
- auto diff_src_w = diff_src +
+ auto diff_src_w = diff_src +
diff_src_d.blk_off(n, g_icb);
- auto diff_dst_w = diff_dst +
+ auto diff_dst_w = diff_dst +
diff_dst_d.blk_off(n, g_ocb);
auto wht_w = weights + wht_blk_off(weights_d, g, 0, icb);
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef WIN_SYNCHAPI
+#define WIN_SYNCHAPI
+
+#include "win_pthread.h"
+#include "synchapi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _pthread_condattr_t pthread_condattr_t;
+
+typedef struct
+{
+ CONDITION_VARIABLE _cv;
+}
+pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t* __cond, const pthread_condattr_t* __cond_attr);
+int pthread_cond_destroy(pthread_cond_t* __cond);
+
+int pthread_cond_timedwait(pthread_cond_t* __cond,
+ pthread_mutex_t* __mutex,
+ const struct timespec* __abstime);
+int pthread_cond_broadcast(pthread_cond_t* __cond);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* WIN_SYNCHAPI */
--- /dev/null
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "win_synchapi.h"
+
+int pthread_cond_init(pthread_cond_t* __cond, const pthread_condattr_t* __cond_attr)
+{
+ if (__cond == NULL) {
+ return ERROR_INVALID_HANDLE;
+ }
+
+ (void)__cond_attr;
+ InitializeConditionVariable(&__cond->_cv);
+ return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t* __cond)
+{
+ (void)__cond;
+ return 0;
+}
+
+int pthread_cond_timedwait(pthread_cond_t* __cond,
+ pthread_mutex_t* __mutex,
+ const struct timespec* __abstime)
+{
+ if (__cond == NULL) {
+ return ERROR_INVALID_HANDLE;
+ }
+
+ long long msec = INFINITE;
+ if (__abstime != NULL) {
+ msec = __abstime->tv_sec * 1000 + __abstime->tv_nsec / 1000000;
+ }
+
+ return SleepConditionVariableCS(&__cond->_cv, __mutex, (DWORD)msec);
+}
+
+int pthread_cond_broadcast(pthread_cond_t *__cond)
+{
+ if (__cond == NULL) {
+ return ERROR_INVALID_HANDLE;
+ }
+
+ WakeConditionVariable(&__cond->_cv);
+ return 0;
+}
*/
class IDevice {
public:
- using time_point = std::chrono::high_resolution_clock::time_point;
+ using time_point = std::chrono::steady_clock::time_point;
virtual ~IDevice() = default;
#include <watchdogPrivate.hpp>
#include <algorithm>
#include <memory>
+#include <string>
#include <ncCommPrivate.h>
#include <mvnc.h>
#include <ncPrivateTypes.h>
#include "XLinkPrivateDefines.h"
#include "XLinkErrorUtils.h"
+#if defined(_WIN32)
+#include "win_synchapi.h"
+#endif // defined(_WIN32)
+
namespace {
using namespace std;
*/
class XLinkDevice : public IDevice {
_devicePrivate_t privateDevice;
- using time_point = std::chrono::high_resolution_clock::time_point;
+ using time_point = std::chrono::steady_clock::time_point;
time_point lastPongTime = time_point::min();
time_point lastPingTime = time_point::min();
enum : int { deviceHangTimeout = 12000};
};
class WatchdogImpl {
- enum : uint8_t {
- STATE_IDLE = 0,
- INITIATE_THREAD_STOP = 1,
- THREAD_EXITED = 2,
- WAKE_UP_THREAD = 3,
- };
-
using wd_context_as_tuple = std::tuple<std::shared_ptr<IDevice>, bool*, void*>;
using Devices = std::list<wd_context_as_tuple>;
std::mutex devicesListAcc;
std::atomic<int> generation = {0};
std::atomic_bool threadRunning;
- volatile std::uint8_t notificationReason = STATE_IDLE;
- std::condition_variable wakeUpPingThread;
+ pthread_mutex_t routineLock;
+ pthread_cond_t wakeUpPingThread;
std::thread poolThread;
- WatchdogImpl() = default;
WatchdogImpl(const WatchdogImpl&) = delete;
WatchdogImpl(WatchdogImpl&&) = delete;
WatchdogImpl& operator = (const WatchdogImpl&) = delete;
WatchdogImpl& operator = (WatchdogImpl&&) = delete;
- public:
+
+private:
+
+ WatchdogImpl() {
+ int rc = pthread_mutex_init(&routineLock, NULL);
+ if (rc != 0) {
+ throw std::runtime_error("failed to initialize \"routineLock\" mutex. rc: " + std::to_string(rc));
+ }
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+ pthread_condattr_t attr;
+ rc = pthread_condattr_init(&attr);
+ if (rc != 0) {
+ throw std::runtime_error("failed to initialize condition variable attribute. rc: " + std::to_string(rc));
+ }
+
+ rc = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+ if (rc != 0) {
+ throw std::runtime_error("failed to set condition variable clock. rc: " + std::to_string(rc));
+ }
+#endif // !(defined(__APPLE__) || defined(_WIN32))
+
+ rc = pthread_cond_init(&wakeUpPingThread, NULL);
+ if (rc != 0) {
+ throw std::runtime_error("failed to initialize \"wakeUpPingThread\" condition variable. rc: " + std::to_string(rc));
+ }
+ }
+
+public:
static WatchdogImpl &instance() {
static WatchdogImpl watchdog;
return watchdog;
}
+
~WatchdogImpl() {
mvLog(MVLOG_INFO, "watchdog terminated\n");
+ try
{
- auto __lock = lock();
+ lockRoutineMutex();
for (auto &item : watchedDevices) {
*std::get<1>(item) = true;
mvLog(MVLOG_WARN, "[%p] device, stop watching due to watchdog termination\n", std::get<2>(item));
}
- notificationReason = THREAD_EXITED;
+ unlockRoutineMutex();
+ } catch (const std::exception & ex) {
+ mvLog(MVLOG_ERROR, "error %s", ex.what());
+ } catch (...) {
+ mvLog(MVLOG_ERROR, "unknown error");
}
- wakeUpPingThread.notify_one();
+ threadRunning = false;
+ int rc = pthread_cond_broadcast(&wakeUpPingThread);
+ if (rc != 0) {
+ mvLog(MVLOG_WARN, "failed to unblock threads blocked on the \"wakeUpPingThread\". rc=%d", rc);
+ }
+
+ rc = pthread_mutex_destroy(&routineLock);
+ if (rc != 0) {
+ mvLog(MVLOG_WARN, "failed to destroy the \"routineLock\". rc=%d", rc);
+ }
+
+ rc = pthread_cond_destroy(&wakeUpPingThread);
+ if (rc != 0) {
+ mvLog(MVLOG_WARN, "failed to destroy the \"wakeUpPingThread\". rc=%d", rc);
+ }
if (poolThread.joinable()) {
poolThread.join();
public:
void *register_device(std::shared_ptr<IDevice> device) {
- auto __locker = lock();
+ lockRoutineMutex();
std::unique_ptr<wd_context_opaque> ctx (new wd_context_opaque);
// rare case of exact pointer address collision
});
} else {
// wake up thread
- notificationReason = WAKE_UP_THREAD;
- wakeUpPingThread.notify_one();
+ int rc = pthread_cond_broadcast(&wakeUpPingThread);
+ if (rc != 0) {
+ mvLog(MVLOG_WARN, "failed to unblock threads blocked on the \"wakeUpPingThread\". rc=%d", rc);
+ }
}
ctx->handleCached = device->getHandle();
ctx->actual = std::get<0>(watchedDevices.back()).get();
+ unlockRoutineMutex();
return ctx.release();
}
if (ptr == nullptr) {
return false;
}
- auto __locker = lock();
+ lockRoutineMutex();
// thread already removed
if (ptr->destroyed) {
delete ptr;
+ unlockRoutineMutex();
return true;
}
}
// wake up thread since we might select removed device as nex to be ping, and there is no more devices available
- notificationReason = WAKE_UP_THREAD;
- __locker.unlock();
- wakeUpPingThread.notify_one();
+ unlockRoutineMutex();
+ int rc = pthread_cond_broadcast(&wakeUpPingThread);
+ if (rc != 0) {
+ mvLog(MVLOG_WARN, "failed to unblock threads blocked on the \"wakeUpPingThread\". rc=%d", rc);
+ }
return bFound;
}
private:
- std::unique_lock<std::mutex> lock() {
- return std::unique_lock<std::mutex>(devicesListAcc);
+ void lockRoutineMutex() {
+ int rc = pthread_mutex_lock(&routineLock);
+ if (rc != 0) {
+ throw std::runtime_error("failed to lock \"routineLock\" mutex. rc: " + std::to_string(rc));
+ }
+ }
+
+ void unlockRoutineMutex() {
+ int rc = pthread_mutex_unlock(&routineLock);
+ if (rc != 0) {
+ throw std::runtime_error("failed to unlock \"routineLock\" mutex. rc: " + std::to_string(rc));
+ }
}
void watchdog_routine() noexcept {
mvLog(MVLOG_INFO, "thread started\n");
milliseconds sleepInterval;
- auto __locker = lock();
+ struct timespec timeToWait = {0, 0};
+ lockRoutineMutex();
+
do {
for (auto deviceIt = watchedDevices.begin(); deviceIt != watchedDevices.end(); ) {
auto &device = std::get<0>(*deviceIt);
- auto isReady = device->dueIn(high_resolution_clock::now()).count() == 0;
+ auto isReady = device->dueIn(steady_clock::now()).count() == 0;
if (isReady) {
auto now = high_resolution_clock::now();
- device->keepAlive(high_resolution_clock::now());
+ device->keepAlive(steady_clock::now());
mvLog(MVLOG_DEBUG, "ping completed in %ld ms\n", duration_cast<std::chrono::milliseconds>(high_resolution_clock ::now()-now).count());
}
if (device->isTimeout()) {
++deviceIt;
}
}
- auto currentTime = high_resolution_clock::now();
+ auto currentTime = steady_clock::now();
auto minInterval = std::min_element(watchedDevices.begin(),
watchedDevices.end(),
[&currentTime] (const Devices::value_type & device1, const Devices::value_type & device2) {
sleepInterval = std::get<0>(*minInterval)->dueIn(currentTime);
mvLog(MVLOG_DEBUG, "sleep interval = %ld ms\n", sleepInterval.count());
- notificationReason = STATE_IDLE;
+ auto sec = std::chrono::duration_cast<std::chrono::seconds>(sleepInterval);
- wakeUpPingThread.wait_until(__locker, currentTime + sleepInterval, [this, currentTime]() {
- mvLog(MVLOG_DEBUG,
- "waiting for %ld ms\n",
- duration_cast<std::chrono::milliseconds>(high_resolution_clock::now() - currentTime).count());
- return notificationReason != STATE_IDLE;
- });
+#if (defined(__APPLE__) || defined(_WIN32))
+ timeToWait.tv_sec = sec.count();
+ timeToWait.tv_nsec =
+ std::chrono::duration_cast<std::chrono::nanoseconds>(sleepInterval).count() -
+ std::chrono::nanoseconds(sec).count();
+#else
+ clock_gettime(CLOCK_MONOTONIC, &timeToWait);
+ timeToWait.tv_sec += sec.count();
+ timeToWait.tv_nsec +=
+ std::chrono::duration_cast<std::chrono::nanoseconds>(sleepInterval).count() -
+ std::chrono::nanoseconds(sec).count();
+#endif // (defined(__APPLE__) || defined(_WIN32))
+
+#if defined(__APPLE__)
+ pthread_cond_timedwait_relative_np(&wakeUpPingThread, &routineLock, &timeToWait);
+#else
+ pthread_cond_timedwait(&wakeUpPingThread, &routineLock, &timeToWait);
+#endif // defined(__APPLE__)
- mvLog(MVLOG_DEBUG, "waiting completed in %ld ms\n",
- duration_cast<std::chrono::milliseconds>(high_resolution_clock ::now() - currentTime).count());
- } while (notificationReason != THREAD_EXITED);
+ mvLog(MVLOG_DEBUG, "waiting completed in %ld ms\n",
+ duration_cast<std::chrono::milliseconds>(steady_clock::now() - currentTime).count());
+ } while (threadRunning);
} catch (const std::exception & ex) {
- mvLog(MVLOG_ERROR, "error %s\n", ex.what());
+ mvLog(MVLOG_ERROR, "error %s", ex.what());
} catch (...) {
- mvLog(MVLOG_ERROR, "error\n");
+ mvLog(MVLOG_ERROR, "unknown error");
}
+
+ unlockRoutineMutex();
mvLog(MVLOG_INFO, "thread ended\n");
- threadRunning = false;
}
};
}
WD_API wd_error_t watchdog_unregister_device(wd_context *ctx) {
- if (ctx == nullptr || ctx->opaque == nullptr) {
- return WD_NOTINITIALIZED;
- } else {
- if (ctx->opaque != WD_OPAQUE_MAGIC) {
- auto watchee = reinterpret_cast<wd_context_opaque*>(ctx->opaque);
- // NOTE: magic field used to pass preallocated watchee - since this function only used by plugin, this is not a backdoor
- if (watchee->magic == WD_OPAQUE_MAGIC) {
- if (!WatchdogImpl::instance().remove_device(ctx->opaque)) {
- mvLog(MVLOG_WARN, "cannot remove device\n");
- return WD_FAIL;
+ try {
+ if (ctx == nullptr || ctx->opaque == nullptr) {
+ return WD_NOTINITIALIZED;
+ } else {
+ if (ctx->opaque != WD_OPAQUE_MAGIC) {
+ auto watchee = reinterpret_cast<wd_context_opaque *>(ctx->opaque);
+ // NOTE: magic field used to pass preallocated watchee - since this function only used by plugin, this is not a backdoor
+ if (watchee->magic == WD_OPAQUE_MAGIC) {
+ if (!WatchdogImpl::instance().remove_device(ctx->opaque)) {
+ mvLog(MVLOG_WARN, "cannot remove device\n");
+ return WD_FAIL;
+ }
}
}
}
- }
- if (ctx != nullptr) {
- // opaque pointer deleted
- ctx->opaque = nullptr;
+ if (ctx != nullptr) {
+ // opaque pointer deleted
+ ctx->opaque = nullptr;
+ }
+
+ return WD_ERRNO;
+ } catch (const std::exception & ex) {
+ mvLog(MVLOG_ERROR, "error %s", ex.what());
+ } catch (...) {
+ mvLog(MVLOG_ERROR, "unknown error");
}
- return WD_ERRNO;
+ return WD_FAIL;
}
## See Also
* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
-* [Model Downloader](./tools/downloader/README.md)
\ No newline at end of file
+* [Model Downloader](./tools/downloader/README.md)
# install
install(TARGETS compile_tool
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH}
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH}
COMPONENT core)
# install
install(TARGETS ${TARGET_NAME}
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH}
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH}
COMPONENT myriad)
\ No newline at end of file
add_perfcheck_target(myriad_perfcheck myriadPlugin)
install(TARGETS myriad_perfcheck
- RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH}
+ RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH}
COMPONENT myriad)
endif()
--- /dev/null
+import argparse
+import os
+from shutil import rmtree
+
+from utils import Automation
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--build_number", type=str, help="Build number to be added to package version", default="0", )
+args = parser.parse_args()
+
+auto = Automation()
+base_dir = os.path.dirname(__file__)
+bom_path = os.path.join(base_dir, "package_BOM.txt")
+bom = auto.parse_bom(bom_path=bom_path)
+dir_to_tar = auto.copy_files_from_bom(root_path=os.path.join(os.path.dirname(__file__), ".."), bom=bom)
+auto.add_version_txt(dst_path=dir_to_tar, build_number=args.build_number)
+
+auto.make_tarfile(out_file_name="mo_for_tf_{0}.tar.gz".format(args.build_number), source_dir=dir_to_tar)
+rmtree(dir_to_tar)
--- /dev/null
+extensions/__init__.py
+extensions/analysis/__init__.py
+extensions/analysis/boolean_input.py
+extensions/analysis/inputs.py
+extensions/analysis/json_print.py
+extensions/analysis/nodes.py
+extensions/analysis/tf_od_api.py
+extensions/analysis/tf_retinanet.py
+extensions/analysis/tf_yolo.py
+extensions/back/__init__.py
+extensions/back/ActivationsNormalizer.py
+extensions/back/AvgPool.py
+extensions/back/blob_normalizer.py
+extensions/back/compress_quantized_weights.py
+extensions/back/ConvolutionNormalizer.py
+extensions/back/CropToStridedSlice.py
+extensions/back/CutMemory.py
+extensions/back/disable_unsupported_ND_operations.py
+extensions/back/DumpFakeQuantStat.py
+extensions/back/ElementwiseOpsToEltwiseOps.py
+extensions/back/EnableConstantStridedSlice.py
+extensions/back/ForceStrictPrecision.py
+extensions/back/fuse_sub_div_min.py
+extensions/back/FuseTransposesSequence.py
+extensions/back/GatherNormalizer.py
+extensions/back/GroupedConvWeightsNormalize.py
+extensions/back/I64ToI32.py
+extensions/back/insert_compatibility_l2normalization.py
+extensions/back/InterpolateToInterpOrResample.py
+extensions/back/kaldi_remove_memory_output.py
+extensions/back/LeakyReLUMutation.py
+extensions/back/LeakyReluToReluWithNegativeSlope.py
+extensions/back/LRNToNorm.py
+extensions/back/LSTMCellNormalizer.py
+extensions/back/MatMulNormalizer.py
+extensions/back/MaxPool.py
+extensions/back/NonMaximumSuppressionNormalize.py
+extensions/back/NormalizeToNormalizeL2.py
+extensions/back/OneHotNormalizer.py
+extensions/back/op_versioning.py
+extensions/back/OptimizeTransposeReshapeSequence.py
+extensions/back/PackBinaryWeights.py
+extensions/back/PadToV7.py
+extensions/back/ParameterToPlaceholder.py
+extensions/back/pass_separator.py
+extensions/back/priorbox_mutation.py
+extensions/back/ProposalMutation.py
+extensions/back/ReduceToPooling.py
+extensions/back/ReduceTransposeDimensions.py
+extensions/back/remove_last_softmax_pattern.py
+extensions/back/RemoveUselessConvert.py
+extensions/back/Reshape0DToSqueeze.py
+extensions/back/ReshapeMutation.py
+extensions/back/ResultNormalizer.py
+extensions/back/ReverseInputChannels.py
+extensions/back/RNNSequenceTypeRename.py
+extensions/back/ScalarConstNormalize.py
+extensions/back/SelectBroadcast.py
+extensions/back/ShapeOfToShape.py
+extensions/back/ShuffleChannelPatternOptimization.py
+extensions/back/ShufflenetReLUReorder.py
+extensions/back/SpecialNodesFinalization.py
+extensions/back/split_normalizer.py
+extensions/back/StridedSliceMasksNormalizer.py
+extensions/back/TileNormalizer.py
+extensions/back/TopKNormalizer.py
+extensions/back/TransposeReduceFusing.py
+extensions/back/TransposeToPermute.py
+extensions/back/UselessConcatRemoval.py
+extensions/front/__init__.py
+extensions/front/ArgMaxSqueeze.py
+extensions/front/ATenToEmbeddingBag.py
+extensions/front/AttributedGatherNormalizer.py
+extensions/front/AttributedPadToPad.py
+extensions/front/binary_quantize_normalization.py
+extensions/front/caffe/__init__.py
+extensions/front/caffe/accum_ext.py
+extensions/front/caffe/argmax_ext.py
+extensions/front/caffe/ArgMaxFlatten.py
+extensions/front/caffe/axpy.py
+extensions/front/caffe/binarization.py
+extensions/front/caffe/binary_conv_ext.py
+extensions/front/caffe/bn.py
+extensions/front/caffe/conv_ext.py
+extensions/front/caffe/correlation_ext.py
+extensions/front/caffe/ctcgreedydecoder_ext.py
+extensions/front/caffe/CustomLayersMapping.xml.example
+extensions/front/caffe/data_augmentation_ext.py
+extensions/front/caffe/detection_output.py
+extensions/front/caffe/elementwise_ext.py
+extensions/front/caffe/eltwise_add_normalize.py
+extensions/front/caffe/elu.py
+extensions/front/caffe/flatten_ext.py
+extensions/front/caffe/grn_ext.py
+extensions/front/caffe/inner_product_ext.py
+extensions/front/caffe/input_ext.py
+extensions/front/caffe/interp_ext.py
+extensions/front/caffe/lrn_ext.py
+extensions/front/caffe/mvn_ext.py
+extensions/front/caffe/normalize_ext.py
+extensions/front/caffe/permute_ext.py
+extensions/front/caffe/pooling_ext.py
+extensions/front/caffe/power_file_ext.py
+extensions/front/caffe/prelu_ext.py
+extensions/front/caffe/priorbox_clustered_ext.py
+extensions/front/caffe/priorbox_ext.py
+extensions/front/caffe/proposal_ext.py
+extensions/front/caffe/proposal_python_ext.py
+extensions/front/caffe/psroipooling_ext.py
+extensions/front/caffe/regionyolo_ext.py
+extensions/front/caffe/relu6.py
+extensions/front/caffe/relu_ext.py
+extensions/front/caffe/reorgyolo_ext.py
+extensions/front/caffe/resample_ext.py
+extensions/front/caffe/reshape.py
+extensions/front/caffe/ShuffleChannel.py
+extensions/front/caffe/shufflechannel_ext.py
+extensions/front/caffe/sigmoid.py
+extensions/front/caffe/simplernms_ext.py
+extensions/front/caffe/slice_to_split.py
+extensions/front/caffe/softmax_ext.py
+extensions/front/caffe/spatial_transformer_ext.py
+extensions/front/caffe/split_to_identity.py
+extensions/front/caffe/tanh.py
+extensions/front/ChangeCastOutputType.py
+extensions/front/ChangePlaceholderTypes.py
+extensions/front/create_tensor_nodes.py
+extensions/front/disable_weights_quantize_value_propagation.py
+extensions/front/div.py
+extensions/front/eltwise_n.py
+extensions/front/ExpandDimsToUnsqueeze.py
+extensions/front/FillToBroadcast.py
+extensions/front/flatten_to_reshape.py
+extensions/front/freeze_placeholder_value.py
+extensions/front/GeLUMerger_Erf.py
+extensions/front/global_pooling_to_reduce.py
+extensions/front/image_scaler.py
+extensions/front/input_cut.py
+extensions/front/instance_normalization.py
+extensions/front/InterpolateNormalizer.py
+extensions/front/kaldi/__init__.py
+extensions/front/kaldi/add_permute_after_convolution.py
+extensions/front/kaldi/add_reshape_around_convolution.py
+extensions/front/kaldi/add_reshape_around_pooling.py
+extensions/front/kaldi/apply_counts.py
+extensions/front/kaldi/logsoftmax_component_ext.py
+extensions/front/kaldi/memory_offset_adjustment.py
+extensions/front/kaldi/replace_eltwise_nin1.py
+extensions/front/kaldi/replace_lstm_node_pattern.py
+extensions/front/kaldi/replace_lstm_nonlinearity.py
+extensions/front/kaldi/set_ports.py
+extensions/front/kaldi/sigmoid_ext.py
+extensions/front/kaldi/split_memoryoffsets.py
+extensions/front/kaldi/tanh_component_ext.py
+extensions/front/Log1p.py
+extensions/front/LogSoftmax.py
+extensions/front/LRNReplacer.py
+extensions/front/MatMul_normalizer.py
+extensions/front/MoveEmbeddedInputsToInputs.py
+extensions/front/mxnet/__init__.py
+extensions/front/mxnet/activation.py
+extensions/front/mxnet/adaptive_avg_pooling_ext.py
+extensions/front/mxnet/add_input_data_to_prior_boxes.py
+extensions/front/mxnet/arange_ext.py
+extensions/front/mxnet/arange_replacer.py
+extensions/front/mxnet/block_grad_ext.py
+extensions/front/mxnet/box_nms_ext.py
+extensions/front/mxnet/cast_ext.py
+extensions/front/mxnet/check_softmax_node_inputs.py
+extensions/front/mxnet/clip_ext.py
+extensions/front/mxnet/conv_ext.py
+extensions/front/mxnet/copy_ext.py
+extensions/front/mxnet/crop_ext.py
+extensions/front/mxnet/custom.py
+extensions/front/mxnet/custom_rpn_proposal.py
+extensions/front/mxnet/deformable_conv_ext.py
+extensions/front/mxnet/deformable_psroi_pooling_ext.py
+extensions/front/mxnet/dropout_ext.py
+extensions/front/mxnet/elementwise_ext.py
+extensions/front/mxnet/eltwise_scalar_replacers.py
+extensions/front/mxnet/exp_ext.py
+extensions/front/mxnet/expand_dims_ext.py
+extensions/front/mxnet/flatten_ext.py
+extensions/front/mxnet/fully_connected_ext.py
+extensions/front/mxnet/gather.py
+extensions/front/mxnet/gather_ext.py
+extensions/front/mxnet/instance_norm_ext.py
+extensions/front/mxnet/leaky_relu.py
+extensions/front/mxnet/lrn_ext.py
+extensions/front/mxnet/max_ext.py
+extensions/front/mxnet/multibox_detection_ext.py
+extensions/front/mxnet/mx_reshape_reverse.py
+extensions/front/mxnet/mx_reshape_to_reshape.py
+extensions/front/mxnet/MXRepeatReplacer.py
+extensions/front/mxnet/null_ext.py
+extensions/front/mxnet/pad_ext.py
+extensions/front/mxnet/pooling_ext.py
+extensions/front/mxnet/proposal_ext.py
+extensions/front/mxnet/psroi_pooling_ext.py
+extensions/front/mxnet/repeat_ext.py
+extensions/front/mxnet/reshape_ext.py
+extensions/front/mxnet/RNN_ext.py
+extensions/front/mxnet/rnn_param_concat.py
+extensions/front/mxnet/roi_pooling_ext.py
+extensions/front/mxnet/shape_array_ext.py
+extensions/front/mxnet/sigmoid.py
+extensions/front/mxnet/slice_channel_ext.py
+extensions/front/mxnet/slice_ext.py
+extensions/front/mxnet/slice_like_ext.py
+extensions/front/mxnet/slice_replacers.py
+extensions/front/mxnet/softmax.py
+extensions/front/mxnet/softmax_activation_ext.py
+extensions/front/mxnet/softmax_ext.py
+extensions/front/mxnet/softmax_output_ext.py
+extensions/front/mxnet/squeeze_ext.py
+extensions/front/mxnet/ssd_anchor_reshape.py
+extensions/front/mxnet/ssd_detection_output_replacer.py
+extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py
+extensions/front/mxnet/ssd_pattern_remove_flatten.py
+extensions/front/mxnet/ssd_pattern_remove_reshape.py
+extensions/front/mxnet/ssd_pattern_remove_transpose.py
+extensions/front/mxnet/ssd_reorder_detection_out_inputs.py
+extensions/front/mxnet/stack_ext.py
+extensions/front/mxnet/swapaxis_ext.py
+extensions/front/mxnet/tile_ext.py
+extensions/front/mxnet/tile_replacer.py
+extensions/front/mxnet/transpose_ext.py
+extensions/front/mxnet/up_sampling_ext.py
+extensions/front/mxnet/where_ext.py
+extensions/front/mxnet/yolo_v3_mobilenet1_voc.json
+extensions/front/mxnet/zeros_ext.py
+extensions/front/no_op_eraser.py
+extensions/front/onnx/__init__.py
+extensions/front/onnx/activation_ext.py
+extensions/front/onnx/affine_ext.py
+extensions/front/onnx/argmax_ext.py
+extensions/front/onnx/aten_ext.py
+extensions/front/onnx/cast_ext.py
+extensions/front/onnx/clip_ext.py
+extensions/front/onnx/const_ext.py
+extensions/front/onnx/constant_fill_ext.py
+extensions/front/onnx/constant_of_shape_ext.py
+extensions/front/onnx/constant_of_shape_to_broadcast.py
+extensions/front/onnx/conv_ext.py
+extensions/front/onnx/crop_ext.py
+extensions/front/onnx/deformable_conv_ext.py
+extensions/front/onnx/detection_output.py
+extensions/front/onnx/detectionoutput_ext.py
+extensions/front/onnx/dropout_ext.py
+extensions/front/onnx/elementwise_ext.py
+extensions/front/onnx/expand_ext.py
+extensions/front/onnx/flatten_ext.py
+extensions/front/onnx/flattenONNX_to_reshape.py
+extensions/front/onnx/gather_ext.py
+extensions/front/onnx/gemm_ext.py
+extensions/front/onnx/group_norm_ext.py
+extensions/front/onnx/gru_ext.py
+extensions/front/onnx/hard_sigmoid_ext.py
+extensions/front/onnx/image_scaler_ext.py
+extensions/front/onnx/instance_normalization_ext.py
+extensions/front/onnx/lp_normalization_ext.py
+extensions/front/onnx/lrn_ext.py
+extensions/front/onnx/lstm_ext.py
+extensions/front/onnx/mask_rcnn.json
+extensions/front/onnx/mask_rcnn_conversion.py
+extensions/front/onnx/matmul_ext.py
+extensions/front/onnx/mean_variance_normalization_ext.py
+extensions/front/onnx/non_max_suppression_ext.py
+extensions/front/onnx/non_max_suppression_normalize.py
+extensions/front/onnx/non_zero_ext.py
+extensions/front/onnx/normalize_ext.py
+extensions/front/onnx/normalize_l2_normalize.py
+extensions/front/onnx/one_hot_ext.py
+extensions/front/onnx/one_hot_normalize.py
+extensions/front/onnx/pad_ext.py
+extensions/front/onnx/parameter_ext.py
+extensions/front/onnx/person_detection_crossroad.json
+extensions/front/onnx/person_detection_crossroad_conversion.py
+extensions/front/onnx/pooling_ext.py
+extensions/front/onnx/priorbox_clustered_ext.py
+extensions/front/onnx/priorbox_ext.py
+extensions/front/onnx/priorgridgenerator_ext.py
+extensions/front/onnx/proposal_ext.py
+extensions/front/onnx/quantize_ext.py
+extensions/front/onnx/reduce_max_ext.py
+extensions/front/onnx/reduce_mean_ext.py
+extensions/front/onnx/reduce_min_ext.py
+extensions/front/onnx/reduce_prod_ext.py
+extensions/front/onnx/reduce_sum_ext.py
+extensions/front/onnx/remove_filtering_boxes_by_size.py
+extensions/front/onnx/resize_ext.py
+extensions/front/onnx/resize_to_interpolate.py
+extensions/front/onnx/reverse_sequence_ext.py
+extensions/front/onnx/rnn_ext.py
+extensions/front/onnx/roialign_ext.py
+extensions/front/onnx/roifeatureextractor_ext.py
+extensions/front/onnx/scatter_ext.py
+extensions/front/onnx/shape_ext.py
+extensions/front/onnx/slice_ext.py
+extensions/front/onnx/softmax_ext.py
+extensions/front/onnx/split_ext.py
+extensions/front/onnx/squeeze_ext.py
+extensions/front/onnx/top_k_ext.py
+extensions/front/onnx/topkrois_ext.py
+extensions/front/onnx/transpose_ext.py
+extensions/front/onnx/unsqueeze_ext.py
+extensions/front/onnx/upsample_ext.py
+extensions/front/output_cut.py
+extensions/front/override_batch.py
+extensions/front/Pack.py
+extensions/front/pass_separator.py
+extensions/front/PowerToEltwises.py
+extensions/front/rank_decomposer.py
+extensions/front/reciprocal.py
+extensions/front/reduce_axis_normalizer.py
+extensions/front/reshape_dim_normalizer.py
+extensions/front/restore_ports.py
+extensions/front/scatter_normalizer.py
+extensions/front/softmax.py
+extensions/front/softsign_replacer.py
+extensions/front/split_normalizer.py
+extensions/front/squared_difference.py
+extensions/front/SqueezeNormalize.py
+extensions/front/standalone_const_eraser.py
+extensions/front/sub.py
+extensions/front/tf/__init__.py
+extensions/front/tf/activation_ext.py
+extensions/front/tf/argmax_ext.py
+extensions/front/tf/assign_elimination.py
+extensions/front/tf/basic_lstm_cell.py
+extensions/front/tf/batch_to_space_ext.py
+extensions/front/tf/BatchMatMul_ext.py
+extensions/front/tf/BatchToSpaceNDToUpsample.py
+extensions/front/tf/BlockLSTM.py
+extensions/front/tf/BlockLSTM_ext.py
+extensions/front/tf/bucketize.py
+extensions/front/tf/bucketize_ext.py
+extensions/front/tf/Cast_ext.py
+extensions/front/tf/concat.py
+extensions/front/tf/concat_ext.py
+extensions/front/tf/const_ext.py
+extensions/front/tf/conv_ext.py
+extensions/front/tf/crop_and_resize_ext.py
+extensions/front/tf/CropAndResizeReplacement.py
+extensions/front/tf/CTCGreedyDecoder.py
+extensions/front/tf/CTCGreedyDecoder_ext.py
+extensions/front/tf/deconv_ext.py
+extensions/front/tf/depth_to_space.py
+extensions/front/tf/elementwise_ext.py
+extensions/front/tf/expand_dims_ext.py
+extensions/front/tf/extract_image_patches_ext.py
+extensions/front/tf/fake_const_ext.py
+extensions/front/tf/FakeQuantWithMinMaxVars.py
+extensions/front/tf/FakeQuantWithMinMaxVars_ext.py
+extensions/front/tf/faster_rcnn_support.json
+extensions/front/tf/faster_rcnn_support_api_v1.10.json
+extensions/front/tf/faster_rcnn_support_api_v1.13.json
+extensions/front/tf/faster_rcnn_support_api_v1.14.json
+extensions/front/tf/faster_rcnn_support_api_v1.15.json
+extensions/front/tf/faster_rcnn_support_api_v1.7.json
+extensions/front/tf/fifo_queue_v2_ext.py
+extensions/front/tf/fifo_replacer.py
+extensions/front/tf/fill_ext.py
+extensions/front/tf/FlattenToReshape.py
+extensions/front/tf/floor_ext.py
+extensions/front/tf/gather_ext.py
+extensions/front/tf/GatherTree_ext.py
+extensions/front/tf/GNMT_DynamicSequenceLengths.py
+extensions/front/tf/identity_ext.py
+extensions/front/tf/InterpolateTransposes.py
+extensions/front/tf/IteratorGetNext_ext.py
+extensions/front/tf/LoopCond_ext.py
+extensions/front/tf/lrn_ext.py
+extensions/front/tf/mask_rcnn_support.json
+extensions/front/tf/mask_rcnn_support_api_v1.11.json
+extensions/front/tf/mask_rcnn_support_api_v1.13.json
+extensions/front/tf/mask_rcnn_support_api_v1.14.json
+extensions/front/tf/mask_rcnn_support_api_v1.15.json
+extensions/front/tf/mask_rcnn_support_api_v1.7.json
+extensions/front/tf/matmul_ext.py
+extensions/front/tf/mvn.py
+extensions/front/tf/mvn_unrolled.py
+extensions/front/tf/nearest_neighbor_upsampling.py
+extensions/front/tf/next_iteration_ext.py
+extensions/front/tf/non_max_suppression_ext.py
+extensions/front/tf/non_max_suppression_normalize.py
+extensions/front/tf/ObjectDetectionAPI.py
+extensions/front/tf/one_hot_ext.py
+extensions/front/tf/pad_ext.py
+extensions/front/tf/pad_tf_to_pad.py
+extensions/front/tf/placeholder_ext.py
+extensions/front/tf/placeholder_with_default_ext.py
+extensions/front/tf/pooling_ext.py
+extensions/front/tf/prelu.py
+extensions/front/tf/reduce_ext.py
+extensions/front/tf/reshape_related_ext.py
+extensions/front/tf/resize_bilinear.py
+extensions/front/tf/resize_nearest_neighbor.py
+extensions/front/tf/retinanet.json
+extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py
+extensions/front/tf/reverse_sequence.py
+extensions/front/tf/reverse_v2.py
+extensions/front/tf/rfcn_support.json
+extensions/front/tf/rfcn_support_api_v1.10.json
+extensions/front/tf/rfcn_support_api_v1.13.json
+extensions/front/tf/rfcn_support_api_v1.14.json
+extensions/front/tf/select_ext.py
+extensions/front/tf/sign_ext.py
+extensions/front/tf/SizeReplacer.py
+extensions/front/tf/slice_ext.py
+extensions/front/tf/softmax_ext.py
+extensions/front/tf/space_to_batch.py
+extensions/front/tf/space_to_batch_ext.py
+extensions/front/tf/space_to_depth_ext.py
+extensions/front/tf/sparse_fill_empty_rows_ext.py
+extensions/front/tf/sparse_segment_mean_ext.py
+extensions/front/tf/sparse_segment_sqrtn_ext.py
+extensions/front/tf/sparse_segment_sum_ext.py
+extensions/front/tf/sparse_to_dense_ext.py
+extensions/front/tf/sparse_weighted_sum.py
+extensions/front/tf/split_ext.py
+extensions/front/tf/SplitConcatPairToInterpolate.py
+extensions/front/tf/ssd_support.json
+extensions/front/tf/ssd_support_api_v1.14.json
+extensions/front/tf/ssd_support_api_v1.15.json
+extensions/front/tf/ssd_toolbox_detection_output.json
+extensions/front/tf/ssd_toolbox_multihead_detection_output.json
+extensions/front/tf/ssd_v2_support.json
+extensions/front/tf/SSDToolboxDetectionOutput.py
+extensions/front/tf/swap_deconv_inputs.py
+extensions/front/tf/swish.py
+extensions/front/tf/SwitchMergeOptimization.py
+extensions/front/tf/TensorArrayExtractors.py
+extensions/front/tf/TensorArrayGatherV3.py
+extensions/front/tf/tensorflow_custom_operations_config_update.py
+extensions/front/tf/tile_ext.py
+extensions/front/tf/topk_ext.py
+extensions/front/tf/transpose_ext.py
+extensions/front/tf/transposed_mvn_unrolled.py
+extensions/front/tf/unique_ext.py
+extensions/front/tf/UnpackPackReverseInputChannels.py
+extensions/front/tf/variable_ext.py
+extensions/front/tf/variables_values_freezing.py
+extensions/front/tf/yolo_v1.json
+extensions/front/tf/yolo_v1_tiny.json
+extensions/front/tf/yolo_v2.json
+extensions/front/tf/yolo_v2_tiny.json
+extensions/front/tf/yolo_v2_tiny_voc.json
+extensions/front/tf/yolo_v2_voc.json
+extensions/front/tf/yolo_v3.json
+extensions/front/tf/yolo_v3_tiny.json
+extensions/front/tf/yolo_v3_voc.json
+extensions/front/TopKNormalize.py
+extensions/front/transformations_config.py
+extensions/front/TransposeOrderNormalizer.py
+extensions/front/user_data_repack.py
+extensions/front/YOLO.py
+extensions/load/__init__.py
+extensions/load/caffe/__init__.py
+extensions/load/caffe/loader.py
+extensions/load/kaldi/__init__.py
+extensions/load/kaldi/loader.py
+extensions/load/loader.py
+extensions/load/mxnet/__init__.py
+extensions/load/mxnet/loader.py
+extensions/load/onnx/__init__.py
+extensions/load/onnx/loader.py
+extensions/load/tf/__init__.py
+extensions/load/tf/loader.py
+extensions/middle/__init__.py
+extensions/middle/AddFakeQuantizeFuse.py
+extensions/middle/AddIsCyclicAttribute.py
+extensions/middle/AddMeanScaleValues.py
+extensions/middle/AnchorToPriorBox.py
+extensions/middle/ApplyNHWCtoNCHWpermutation.py
+extensions/middle/ApplyPermutations.py
+extensions/middle/ArgMaxToTopK.py
+extensions/middle/AttributedTileNormalizer.py
+extensions/middle/BiasAddBroadcasting.py
+extensions/middle/BinarizeWeightsM1P1.py
+extensions/middle/BlockLSTMtoLSTMSequence.py
+extensions/middle/CheckForCycle.py
+extensions/middle/ConcatOptimization.py
+extensions/middle/ConstSwitchResolver.py
+extensions/middle/ConvertGroupedStridedSlice.py
+extensions/middle/ConvertLayoutDependentOperations.py
+extensions/middle/ConvertMultiInputConv.py
+extensions/middle/ConvToBinaryConv.py
+extensions/middle/CustomSubgraphCall.py
+extensions/middle/CutInputHavingZeroDimFromConcat.py
+extensions/middle/DecomposeBias.py
+extensions/middle/DecomposeBidirectionalRNNSequence.py
+extensions/middle/Deconvolution3rdInputNormalization.py
+extensions/middle/DeleteControlFlowEdges.py
+extensions/middle/DeleteNotExecutable.py
+extensions/middle/DepthToSpace.py
+extensions/middle/DilatedConvolution.py
+extensions/middle/EltwiseChecker.py
+extensions/middle/EltwiseInputNormalization.py
+extensions/middle/EltwiseInputReshape.py
+extensions/middle/EmbeddingBagResolver.py
+extensions/middle/FakeSplitOutputs.py
+extensions/middle/FusedBatchNormNonConstant.py
+extensions/middle/FusedBatchNormTraining.py
+extensions/middle/FuseReshapesSequence.py
+extensions/middle/fusings.py
+extensions/middle/GatherNdNormalizer.py
+extensions/middle/GroupNorm.py
+extensions/middle/GRURNNSequenceToTensorIterator.py
+extensions/middle/InputCut.py
+extensions/middle/InsertLayoutPropagationTransposes.py
+extensions/middle/InsertSelect.py
+extensions/middle/InterpolateSequenceToInterpolate.py
+extensions/middle/L2NormToNorm.py
+extensions/middle/LayoutChangeForConstantShapePaths.py
+extensions/middle/LeakyReluPattern.py
+extensions/middle/LSTMRNNSequenceToTensorIterator.py
+extensions/middle/MinimumMiddleReplacer.py
+extensions/middle/MulAddToSS.py
+extensions/middle/MulFakeQuantizeFuse.py
+extensions/middle/MXNetRNNSequenceNormalize.py
+extensions/middle/MXNetSplitMultiLayers.py
+extensions/middle/MXTileReplacer.py
+extensions/middle/NasNet.py
+extensions/middle/ONNXRNNSequenceNormalize.py
+extensions/middle/PartialInfer.py
+extensions/middle/pass_separator.py
+extensions/middle/permute_tensor_iterator.py
+extensions/middle/preprocessing.py
+extensions/middle/quantize_fuses.py
+extensions/middle/ReluQuantizeFuse.py
+extensions/middle/RemoveDuplicationMemory.py
+extensions/middle/RemoveIdentity.py
+extensions/middle/RemoveRedundantReshapeAfterCropAndResize.py
+extensions/middle/RemoveRedundantReshapes.py
+extensions/middle/RemoveUselessConcatSplit.py
+extensions/middle/RemoveUselessCrops.py
+extensions/middle/RemoveUselessPad.py
+extensions/middle/ReplaceMemoryOffsetWithSplice.py
+extensions/middle/ReplacePNorm.py
+extensions/middle/ReplaceSpliceNodePattern.py
+extensions/middle/reverse_tensor_iterator.py
+extensions/middle/ReverseTransposeNormalization.py
+extensions/middle/ReverseV2ToReverseSequence.py
+extensions/middle/RNNSequenceNormalizeToIE.py
+extensions/middle/ScaleInput.py
+extensions/middle/SharedWeightsDuplication.py
+extensions/middle/SliceConverter.py
+extensions/middle/space_to_depth.py
+extensions/middle/sparse_reshape.py
+extensions/middle/ssd_anchors_to_const.py
+extensions/middle/SwapAxesMiddleReplacer.py
+extensions/middle/TensorIterator_utils.py
+extensions/middle/TensorIteratorBackEdge.py
+extensions/middle/TensorIteratorCondition.py
+extensions/middle/TensorIteratorConditionChecker.py
+extensions/middle/TensorIteratorInput.py
+extensions/middle/TensorIteratorLSTMToLSTMSequence.py
+extensions/middle/TensorIteratorMerge.py
+extensions/middle/TensorIteratorOutput.py
+extensions/middle/TF_lstm_cell_to_generic.py
+extensions/middle/UnsqueezeTileReshapeBlockToInterpolate.py
+extensions/middle/UpsampleToResample.py
+extensions/middle/UselessMerge.py
+extensions/middle/UselessSplitEraser.py
+extensions/middle/UselessStridedSlice.py
+extensions/middle/wights_permute_normalizer.py
+extensions/ops/__init__.py
+extensions/ops/accum.py
+extensions/ops/activation_ops.py
+extensions/ops/adaptive_avg_pooling.py
+extensions/ops/argmax.py
+extensions/ops/assert_op.py
+extensions/ops/aten.py
+extensions/ops/axpy.py
+extensions/ops/binarization.py
+extensions/ops/BlockLSTM.py
+extensions/ops/bn.py
+extensions/ops/box_nms.py
+extensions/ops/bucketize.py
+extensions/ops/Cast.py
+extensions/ops/constant_fill.py
+extensions/ops/copyop.py
+extensions/ops/correlation.py
+extensions/ops/ctc_greedy_decoder.py
+extensions/ops/data_augmentation.py
+extensions/ops/depth_to_space.py
+extensions/ops/DetectionOutput.py
+extensions/ops/detectionoutput_onnx.py
+extensions/ops/elementwise.py
+extensions/ops/embedding_bag.py
+extensions/ops/Enter.py
+extensions/ops/Exit.py
+extensions/ops/exp.py
+extensions/ops/fakequantize.py
+extensions/ops/gather.py
+extensions/ops/GatherNd.py
+extensions/ops/GatherTree.py
+extensions/ops/gelu.py
+extensions/ops/grn.py
+extensions/ops/GRU.py
+extensions/ops/GRUCell.py
+extensions/ops/hard_sigmoid.py
+extensions/ops/identity.py
+extensions/ops/instance_normalization.py
+extensions/ops/interp.py
+extensions/ops/interpolate.py
+extensions/ops/Log.py
+extensions/ops/LSTM.py
+extensions/ops/lstm_cell.py
+extensions/ops/lstm_sequence.py
+extensions/ops/MatMul.py
+extensions/ops/merge.py
+extensions/ops/mvn.py
+extensions/ops/mxrepeat.py
+extensions/ops/mxreshape.py
+extensions/ops/mxslice.py
+extensions/ops/NextIteration.py
+extensions/ops/non_max_suppression.py
+extensions/ops/non_zero.py
+extensions/ops/normalize.py
+extensions/ops/normalize_l2.py
+extensions/ops/one_hot.py
+extensions/ops/pack.py
+extensions/ops/parameter.py
+extensions/ops/pnorm.py
+extensions/ops/power_file.py
+extensions/ops/prediction_heatmap.py
+extensions/ops/prelu.py
+extensions/ops/priorbox.py
+extensions/ops/priorbox_clustered.py
+extensions/ops/priorgridgenerator_onnx.py
+extensions/ops/proposal.py
+extensions/ops/proposal_onnx.py
+extensions/ops/proposal_python_example.py
+extensions/ops/psroipooling.py
+extensions/ops/range.py
+extensions/ops/rank.py
+extensions/ops/ReduceOps.py
+extensions/ops/regionyolo.py
+extensions/ops/reorgyolo.py
+extensions/ops/resample.py
+extensions/ops/resize.py
+extensions/ops/resize_factor_utils.py
+extensions/ops/Reverse.py
+extensions/ops/reverse_sequence.py
+extensions/ops/RNN.py
+extensions/ops/RNNCell.py
+extensions/ops/roialign.py
+extensions/ops/roifeatureextractor_onnx.py
+extensions/ops/scatter.py
+extensions/ops/select.py
+extensions/ops/shufflechannel.py
+extensions/ops/simplernms.py
+extensions/ops/size.py
+extensions/ops/space_to_depth.py
+extensions/ops/sparse_fill_empty_rows.py
+extensions/ops/sparse_reshape.py
+extensions/ops/sparse_segment_mean.py
+extensions/ops/sparse_segment_sqrtn.py
+extensions/ops/sparse_segment_sum.py
+extensions/ops/sparse_to_dense.py
+extensions/ops/sparse_weighted_sum.py
+extensions/ops/spatial_transformer.py
+extensions/ops/splice.py
+extensions/ops/split.py
+extensions/ops/stop_gradient.py
+extensions/ops/swapaxis.py
+extensions/ops/switch.py
+extensions/ops/tensor_iterator.py
+extensions/ops/TensorArray.py
+extensions/ops/TensorArrayGather.py
+extensions/ops/TensorArrayRead.py
+extensions/ops/TensorArrayScatter.py
+extensions/ops/TensorArraySize.py
+extensions/ops/TensorArrayWrite.py
+extensions/ops/TensorIterator_ops.py
+extensions/ops/topk.py
+extensions/ops/topkrois_onnx.py
+extensions/ops/transpose.py
+extensions/ops/unique.py
+extensions/ops/upsample.py
+install_prerequisites/install_prerequisites.bat
+install_prerequisites/install_prerequisites.sh
+install_prerequisites/install_prerequisites_caffe.bat
+install_prerequisites/install_prerequisites_caffe.sh
+install_prerequisites/install_prerequisites_kaldi.bat
+install_prerequisites/install_prerequisites_kaldi.sh
+install_prerequisites/install_prerequisites_mxnet.bat
+install_prerequisites/install_prerequisites_mxnet.sh
+install_prerequisites/install_prerequisites_onnx.bat
+install_prerequisites/install_prerequisites_onnx.sh
+install_prerequisites/install_prerequisites_tf.bat
+install_prerequisites/install_prerequisites_tf.sh
+install_prerequisites/protobuf-3.6.1-py3.4-win-amd64.egg
+install_prerequisites/protobuf-3.6.1-py3.5-win-amd64.egg
+install_prerequisites/protobuf-3.6.1-py3.6-win-amd64.egg
+install_prerequisites/protobuf-3.6.1-py3.7-win-amd64.egg
+mo.py
+mo/__init__.py
+mo/back/__init__.py
+mo/back/ie_ir_ver_2/__init__.py
+mo/back/ie_ir_ver_2/emitter.py
+mo/back/replacement.py
+mo/front/__init__.py
+mo/front/caffe/__init__.py
+mo/front/caffe/collect_attributes.py
+mo/front/caffe/custom_layers_mapping.py
+mo/front/caffe/extractor.py
+mo/front/caffe/extractors/__init__.py
+mo/front/caffe/extractors/batchnorm.py
+mo/front/caffe/extractors/concat.py
+mo/front/caffe/extractors/crop.py
+mo/front/caffe/extractors/native_caffe.py
+mo/front/caffe/extractors/roipooling.py
+mo/front/caffe/extractors/scale.py
+mo/front/caffe/extractors/slice.py
+mo/front/caffe/extractors/tile.py
+mo/front/caffe/extractors/utils.py
+mo/front/caffe/loader.py
+mo/front/caffe/proto/__init__.py
+mo/front/caffe/proto/caffe_pb2.py
+mo/front/caffe/proto/generate_caffe_pb2.py
+mo/front/caffe/proto/mo_caffe.proto
+mo/front/caffe/python_layer_extractor.py
+mo/front/caffe/register_custom_ops.py
+mo/front/common/__init__.py
+mo/front/common/custom_replacement_registry.py
+mo/front/common/extractors/utils.py
+mo/front/common/find_unsupported_ops.py
+mo/front/common/layout.py
+mo/front/common/partial_infer/__init__.py
+mo/front/common/partial_infer/batch_norm.py
+mo/front/common/partial_infer/caffe_fallback.py
+mo/front/common/partial_infer/concat.py
+mo/front/common/partial_infer/crop.py
+mo/front/common/partial_infer/elemental.py
+mo/front/common/partial_infer/eltwise.py
+mo/front/common/partial_infer/multi_box_detection.py
+mo/front/common/partial_infer/multi_box_prior.py
+mo/front/common/partial_infer/random_uniform.py
+mo/front/common/partial_infer/reshape.py
+mo/front/common/partial_infer/roipooling.py
+mo/front/common/partial_infer/slice.py
+mo/front/common/partial_infer/utils.py
+mo/front/common/register_custom_ops.py
+mo/front/common/replacement.py
+mo/front/common/weights.py
+mo/front/extractor.py
+mo/front/kaldi/__init__.py
+mo/front/kaldi/extractor.py
+mo/front/kaldi/extractors/__init__.py
+mo/front/kaldi/extractors/add_ext.py
+mo/front/kaldi/extractors/add_shift_ext.py
+mo/front/kaldi/extractors/affine_component_ext.py
+mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py
+mo/front/kaldi/extractors/affine_transform_ext.py
+mo/front/kaldi/extractors/backproptruncation_ext.py
+mo/front/kaldi/extractors/batchnorm_component_ext.py
+mo/front/kaldi/extractors/clip_ext.py
+mo/front/kaldi/extractors/concat_ext.py
+mo/front/kaldi/extractors/convolutional_1d_component_ext.py
+mo/front/kaldi/extractors/convolutional_component_ext.py
+mo/front/kaldi/extractors/copy_ext.py
+mo/front/kaldi/extractors/crop_ext.py
+mo/front/kaldi/extractors/elementwise_component_ext.py
+mo/front/kaldi/extractors/fixed_affine_component_ext.py
+mo/front/kaldi/extractors/linear_component_ext.py
+mo/front/kaldi/extractors/lstm_nonlinearity_ext.py
+mo/front/kaldi/extractors/lstm_projected_streams_ext.py
+mo/front/kaldi/extractors/max_pooling_ext.py
+mo/front/kaldi/extractors/memoryoffset_ext.py
+mo/front/kaldi/extractors/naturalgradient_affine_component_ext.py
+mo/front/kaldi/extractors/noop_ext.py
+mo/front/kaldi/extractors/normalize_component_ext.py
+mo/front/kaldi/extractors/pnorm_component_ext.py
+mo/front/kaldi/extractors/rectified_linear_component_ext.py
+mo/front/kaldi/extractors/rescale_ext.py
+mo/front/kaldi/extractors/scale_component_ext.py
+mo/front/kaldi/extractors/slice_ext.py
+mo/front/kaldi/extractors/softmax_ext.py
+mo/front/kaldi/extractors/splice_component_ext.py
+mo/front/kaldi/loader/__init__.py
+mo/front/kaldi/loader/loader.py
+mo/front/kaldi/loader/utils.py
+mo/front/kaldi/register_custom_ops.py
+mo/front/kaldi/utils.py
+mo/front/mxnet/__init__.py
+mo/front/mxnet/extractor.py
+mo/front/mxnet/extractors/__init__.py
+mo/front/mxnet/extractors/add_n.py
+mo/front/mxnet/extractors/batchnorm.py
+mo/front/mxnet/extractors/concat.py
+mo/front/mxnet/extractors/l2_normalization.py
+mo/front/mxnet/extractors/multibox_prior.py
+mo/front/mxnet/extractors/relu.py
+mo/front/mxnet/extractors/scaleshift.py
+mo/front/mxnet/extractors/slice_axis.py
+mo/front/mxnet/extractors/utils.py
+mo/front/mxnet/loader.py
+mo/front/mxnet/nd_to_params.py
+mo/front/mxnet/register_custom_ops.py
+mo/front/onnx/__init__.py
+mo/front/onnx/extractor.py
+mo/front/onnx/extractors/__init__.py
+mo/front/onnx/extractors/concat.py
+mo/front/onnx/extractors/eltwise.py
+mo/front/onnx/extractors/fused_bn.py
+mo/front/onnx/extractors/reshape.py
+mo/front/onnx/extractors/utils.py
+mo/front/onnx/loader.py
+mo/front/onnx/register_custom_ops.py
+mo/front/subgraph_matcher.py
+mo/front/tf/__init__.py
+mo/front/tf/common.py
+mo/front/tf/custom_subgraph_call.py
+mo/front/tf/extractor.py
+mo/front/tf/extractors/__init__.py
+mo/front/tf/extractors/concat.py
+mo/front/tf/extractors/fused_bn.py
+mo/front/tf/extractors/identity.py
+mo/front/tf/extractors/native_tf.py
+mo/front/tf/extractors/pack.py
+mo/front/tf/extractors/random_uniform.py
+mo/front/tf/extractors/strided_slice.py
+mo/front/tf/extractors/utils.py
+mo/front/tf/graph_utils.py
+mo/front/tf/loader.py
+mo/front/tf/partial_infer/__init__.py
+mo/front/tf/partial_infer/tf.py
+mo/front/tf/register_custom_ops.py
+mo/front/tf/replacement.py
+mo/graph/__init__.py
+mo/graph/connection.py
+mo/graph/graph.py
+mo/graph/perm_inputs.py
+mo/graph/port.py
+mo/main.py
+mo/middle/__init__.py
+mo/middle/passes/__init__.py
+mo/middle/passes/conv.py
+mo/middle/passes/convert_data_type.py
+mo/middle/passes/debug.py
+mo/middle/passes/eliminate.py
+mo/middle/passes/fusing/__init__.py
+mo/middle/passes/fusing/decomposition.py
+mo/middle/passes/fusing/fuse_grouped_conv.py
+mo/middle/passes/fusing/fuse_linear_ops.py
+mo/middle/passes/fusing/fuse_linear_seq.py
+mo/middle/passes/fusing/helpers.py
+mo/middle/passes/fusing/mark_unfused_nodes.py
+mo/middle/passes/fusing/resnet_optimization.py
+mo/middle/passes/infer.py
+mo/middle/passes/leaky_relu.py
+mo/middle/passes/mean_scale_values.py
+mo/middle/passes/tensor_names.py
+mo/middle/pattern_match.py
+mo/middle/replacement.py
+mo/ops/__init__.py
+mo/ops/activation.py
+mo/ops/broadcast.py
+mo/ops/clamp.py
+mo/ops/concat.py
+mo/ops/const.py
+mo/ops/constant_of_shape.py
+mo/ops/convolution.py
+mo/ops/crop.py
+mo/ops/deconvolution.py
+mo/ops/deformable_convolution.py
+mo/ops/eltwise.py
+mo/ops/eltwise_n.py
+mo/ops/eltwise_ninputs_in_1.py
+mo/ops/expand_dims.py
+mo/ops/fill.py
+mo/ops/flatten.py
+mo/ops/group_norm.py
+mo/ops/lrn.py
+mo/ops/lstmnonlinearity.py
+mo/ops/memory.py
+mo/ops/memoryoffset.py
+mo/ops/op.py
+mo/ops/pad.py
+mo/ops/permute.py
+mo/ops/pooling.py
+mo/ops/power.py
+mo/ops/reshape.py
+mo/ops/result.py
+mo/ops/roipooling.py
+mo/ops/scale_shift.py
+mo/ops/shape.py
+mo/ops/slice.py
+mo/ops/softmax.py
+mo/ops/space_to_batch.py
+mo/ops/squeeze.py
+mo/ops/strided_slice.py
+mo/ops/tile.py
+mo/ops/unsqueeze.py
+mo/pipeline/__init__.py
+mo/pipeline/common.py
+mo/pipeline/unified.py
+mo/utils/__init__.py
+mo/utils/class_registration.py
+mo/utils/cli_parser.py
+mo/utils/custom_replacement_config.py
+mo/utils/dsu.py
+mo/utils/error.py
+mo/utils/find_inputs.py
+mo/utils/graph.py
+mo/utils/guess_framework.py
+mo/utils/import_extensions.py
+mo/utils/ir_engine/__init__.py
+mo/utils/ir_engine/compare_graphs.py
+mo/utils/ir_engine/ir_engine.py
+mo/utils/ir_reader/__init__.py
+mo/utils/ir_reader/extender.py
+mo/utils/ir_reader/extenders/binary_convolution_extender.py
+mo/utils/ir_reader/extenders/conv_extender.py
+mo/utils/ir_reader/extenders/convert_extender.py
+mo/utils/ir_reader/extenders/deconvolution_extender.py
+mo/utils/ir_reader/extenders/deformable_convolution_extender.py
+mo/utils/ir_reader/extenders/experimental_extender.py
+mo/utils/ir_reader/extenders/fakequantize_extender.py
+mo/utils/ir_reader/extenders/GRUCell_extender.py
+mo/utils/ir_reader/extenders/interpolate_extender.py
+mo/utils/ir_reader/extenders/LSTMCell_extender.py
+mo/utils/ir_reader/extenders/non_zero_extender.py
+mo/utils/ir_reader/extenders/pad_extender.py
+mo/utils/ir_reader/extenders/parameter_extender.py
+mo/utils/ir_reader/extenders/pooling_extender.py
+mo/utils/ir_reader/extenders/priorbox_clustered_extender.py
+mo/utils/ir_reader/extenders/priorbox_extender.py
+mo/utils/ir_reader/extenders/reorg_yolo_extender.py
+mo/utils/ir_reader/extenders/RNNCell_extender.py
+mo/utils/ir_reader/extenders/strided_slice_extender.py
+mo/utils/ir_reader/extenders/tensoriterator_extender.py
+mo/utils/ir_reader/extenders/topk_extender.py
+mo/utils/ir_reader/extenders/variadic_split_extender.py
+mo/utils/ir_reader/layer_to_class.py
+mo/utils/ir_reader/restore_graph.py
+mo/utils/logger.py
+mo/utils/model_analysis.py
+mo/utils/pipeline_config.py
+mo/utils/replacement_pattern.py
+mo/utils/shape.py
+mo/utils/simple_proto_parser.py
+mo/utils/str_to.py
+mo/utils/summarize_graph.py
+mo/utils/tensorboard_util.py
+mo/utils/unsupported_ops.py
+mo/utils/utils.py
+mo/utils/version.py
+mo/utils/versions_checker.py
+mo_caffe.py
+mo_kaldi.py
+mo_mxnet.py
+mo_onnx.py
+mo_tf.py
+requirements.txt
+requirements_caffe.txt
+requirements_kaldi.txt
+requirements_mxnet.txt
+requirements_onnx.txt
+requirements_tf.txt
--- /dev/null
+import os
+import subprocess
+import tarfile
+from datetime import datetime
+from shutil import copy, copytree, rmtree
+
+
+
+class Automation:
+    @staticmethod
+    def parse_bom(bom_path):
+        files = []
+        for file in open(bom_path):  # NOTE: entries keep their trailing '\n'; consumers strip it
+            files.append(file)
+        return files
+
+    @staticmethod
+    def copy_files_from_bom(root_path, bom):
+        target_dir = os.path.join(os.path.dirname(__file__), "ModelOptimizerForTensorflow")
+        if os.path.exists(target_dir):
+            rmtree(target_dir)
+        os.makedirs(target_dir)
+        for file in bom:
+            src = os.path.join(root_path, file.strip('\n'))
+            dst = os.path.join(target_dir, file.strip('\n'))
+            if not os.path.exists(os.path.dirname(dst)):
+                os.makedirs(os.path.dirname(dst))
+            if os.path.isdir(src):
+                copytree(src, dst)
+            else:
+                copy(src, dst)
+        return target_dir
+
+    @staticmethod
+    def add_version_txt(dst_path, build_number):
+        timestamp = datetime.now().strftime("%I:%M%p %B %d, %Y")
+        with open(os.path.join(dst_path, "version.txt"), 'w') as f:
+            f.write(timestamp + '\n')
+            f.write(build_number + '\n')
+
+    @staticmethod
+    def make_tarfile(out_file_name, source_dir):
+        archive_path = os.path.join(os.path.dirname(__file__), out_file_name)
+        if os.path.exists(archive_path):
+            os.remove(archive_path)
+        with tarfile.open(archive_path, "w:gz") as tar:  # write at the path we just cleared, not the CWD
+            tar.add(source_dir, arcname=os.path.basename(source_dir))
limitations under the License.
"""
+import numpy as np
+
from mo.graph.graph import Graph
from mo.utils.model_analysis import AnalyzeAction
-import numpy as np
class TrainingPhaseAnalysis(AnalyzeAction):
See the License for the specific language governing permissions and
limitations under the License.
"""
-import logging as log
-
import json
+import logging as log
import sys
import numpy as np
import logging as log
from mo.graph.graph import Graph
-from mo.utils.model_analysis import AnalyzeAction, graph_contains_scope, AnalysisResults
+from mo.utils.model_analysis import AnalyzeAction, graph_contains_scope
from mo.utils.utils import files_by_pattern, get_mo_root_dir
See the License for the specific language governing permissions and
limitations under the License.
"""
+import unittest
+
import numpy as np
-import unittest
from extensions.back.CutMemory import CutMemory
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class CutMemoryTest(unittest.TestCase):
"""
import logging as log
+
import numpy as np
from extensions.back.ForceStrictPrecision import ForceStrictPrecision
"""
import numpy as np
+
from extensions.ops.split import VariadicSplit
-from mo.front.tf.graph_utils import create_op_node_with_second_input
-from mo.front.common.partial_infer.utils import int64_array
from mo.back.replacement import BackReplacementPattern
+from mo.front.common.partial_infer.utils import int64_array
+from mo.front.tf.graph_utils import create_op_node_with_second_input
from mo.graph.graph import Graph
-from mo.ops.reshape import Reshape
from mo.ops.const import Const
+from mo.ops.reshape import Reshape
class LSTMCellNormalizer(BackReplacementPattern):
"""
import logging as log
-
import math
+
import numpy as np
-from extensions.middle.FuseReshapesSequence import FuseReshapesSequence
from extensions.back.FuseTransposesSequence import FuseTransposesSequence
+from extensions.middle.FuseReshapesSequence import FuseReshapesSequence
from extensions.middle.RemoveRedundantReshapes import RemoveRedundantReshapes
from mo.back.replacement import BackReplacementPattern
from mo.front.common.partial_infer.utils import int64_array
limitations under the License.
"""
import logging as log
+
import numpy as np
from extensions.back.ReshapeMutation import ReshapeMutation
from extensions.back.ReduceToPooling import ReduceReplacer, ReduceMerge
from mo.front.common.partial_infer.utils import int64_array
-from mo.middle.passes.eliminate import shape_inference, eliminate_dead_nodes
+from mo.middle.passes.eliminate import shape_inference
from mo.middle.passes.eliminate_test import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
import numpy as np
-from mo.back.replacement import BackReplacementPattern
from extensions.back.OptimizeTransposeReshapeSequence import set_reshape_new_output_shape
+from mo.back.replacement import BackReplacementPattern
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph
limitations under the License.
"""
import logging as log
+
import numpy as np
from extensions.back.ForceStrictPrecision import ForceStrictPrecision
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
from mo.back.replacement import BackReplacementPattern
-from mo.graph.graph import Graph, Node
-from mo.middle.pattern_match import for_each_sub_graph_recursively
+from mo.graph.graph import Graph
class ShapeOfToShape(BackReplacementPattern):
import numpy as np
from extensions.back.ShufflenetReLUReorder import ShufflenetReLUReorder
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
# The dictionary with nodes attributes used to build various graphs. A key is the name of the node and the value is the
# dictionary with node attributes.
import numpy as np
from extensions.back.pass_separator import BackFinish
-from extensions.ops.split import Split
from extensions.ops.tensor_iterator import TensorIterator, get_internal_node_by_layer_id
from mo.back.replacement import BackReplacementPattern
-from mo.front.tf.graph_utils import create_op_node_with_second_input
from mo.graph.graph import Graph
from mo.ops.const import Const
from mo.utils.error import Error
limitations under the License.
"""
import unittest
+
import numpy as np
+
from extensions.back.SpecialNodesFinalization import CreateConstNodesReplacement
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class CreateConstNodesReplacementTest(unittest.TestCase):
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
from extensions.back.ConvolutionNormalizer import DeconvolutionNormalizer
from extensions.back.CropToStridedSlice import CropToStridedSlice
from extensions.back.TileNormalizer import TileMultipleAxisReplacer, Tile3DReshaper
from mo.front.common.partial_infer.utils import int64_array
from mo.ops.tile import Tile
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class TileMultipleAxisReplacerTest(unittest.TestCase):
"""
import unittest
-import numpy as np
from argparse import Namespace
+import numpy as np
+
from extensions.back.compress_quantized_weights import CompressQuantizeWeights
from extensions.ops.fakequantize import FakeQuantize
-from mo.ops.const import Const
from mo.front.common.partial_infer.eltwise import eltwise_infer
from mo.graph.graph import Node
+from mo.ops.const import Const
from mo.utils.ir_engine.compare_graphs import compare_graphs
from mo.utils.unittest.graph import build_graph
limitations under the License.
"""
-import networkx as nx
-
from mo.back.replacement import BackReplacementPattern
from mo.graph.graph import Node, Graph
from mo.utils.error import Error
"""
import numpy as np
-import networkx as nx
-from mo.ops.const import Const
-from mo.ops.op import Op
-from mo.graph.graph import Graph
from mo.back.replacement import BackReplacementPattern
+from mo.graph.graph import Graph
+from mo.ops.const import Const
class CompatibilityL2NormalizationPattern(BackReplacementPattern):
"""
import logging as log
+
import numpy as np
from mo.front.common.replacement import FrontReplacementSubgraph
"""
import logging as log
+
import numpy as np
-from mo.front.common.replacement import FrontReplacementPattern
from mo.front.common.partial_infer.utils import int64_array
+from mo.front.common.replacement import FrontReplacementPattern
from mo.graph.graph import Graph
from mo.ops.const import Const
from mo.ops.unsqueeze import Unsqueeze
import logging as log
from math import sqrt, fabs
+
from extensions.ops.gelu import GeLUOP
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.graph.graph import Graph
import numpy as np
+from extensions.ops.elementwise import Mul
from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Graph
-from extensions.ops.elementwise import Mul
from mo.ops.const import Const
import numpy as np
from extensions.front.LRNReplacer import LRNReplacer
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
import numpy as np
from extensions.front.Log1p import Log1p
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder': {'shape': np.array([4, 5, 6]), 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
"""
from extensions.ops.Log import LogOp
from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Graph, Node
+from mo.graph.graph import Graph, Node, rename_nodes
from mo.ops.softmax import Softmax
class LogSoftmaxFrontReplacer(FrontReplacementOp):
"""
- Replace LogSoftmax operation by Softmax -> Log.
+ Replace LogSoftmax operation with Softmax -> Log.
"""
op = "LogSoftmax"
enabled = True
def replace_op(self, graph: Graph, node: Node):
- axis = -1
- if 'axis' in node.pb.attr:
- axis = node.pb.attr['axis'].i
+ node_name = node.soft_get('name', node.id)
+ assert node.has_valid('axis'), 'The node "{}" does not have mandatory attribute "axis"'.format(node_name)
- log = LogOp(graph, {'name': node.name + '/Log_'}).create_node()
- softmax = Softmax(graph, {'axis': axis, 'name': node.name + '/SoftMax_'}).create_node()
+ log = LogOp(graph, {}).create_node()
+ softmax = Softmax(graph, {'axis': node.axis, 'name': node_name + '/Softmax'}).create_node()
+ rename_nodes([(node, node_name + '/delete'), (log, node_name)])
# Connect nodes: input -> Softmax -> Log
node.in_port(0).get_connection().set_destination(softmax.in_port(0))
log.in_port(0).get_connection().set_source(softmax.out_port(0))
-
- # The "explicit" version of the return value is: [(out_node.id, 0)])
return [log.id]
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+from extensions.front.LogSoftmax import LogSoftmaxFrontReplacer
+from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph, regular_op, result, connect
+
+nodes = {
+ **regular_op('input', {'type': 'Parameter'}),
+ **regular_op('logsoftmax', {'type': None, 'op': 'LogSoftmax', 'axis': -2, 'name': 'my_logsoftmax'}),
+ **result('output'),
+}
+edges = [
+ ('input', 'logsoftmax'),
+ ('logsoftmax', 'output'),
+]
+
+
+class LogSoftmaxReplacerTest(unittest.TestCase):
+ def test_1(self):
+ graph = build_graph(nodes, edges)
+
+ graph_ref = build_graph({
+ **regular_op('input', {'type': 'Parameter'}),
+ **regular_op('log', {'op': 'Log', 'type': 'Log'}),
+ **regular_op('softmax', {'op': 'SoftMax', 'type': 'SoftMax', 'axis': -2}),
+ **result('output'),
+ },
+ [
+ ('input', 'softmax'),
+ ('softmax', 'log'),
+ ('log', 'output'),
+ ])
+
+ graph.graph['layout'] = 'NCHW'
+ graph.stage = 'front'
+
+ LogSoftmaxFrontReplacer().find_and_replace_pattern(graph)
+
+ (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+ self.assertTrue(flag, resp)
+ self.assertTrue(graph.get_op_nodes(op='Log')[0].name == 'my_logsoftmax')
+
+ def test_2(self):
+ graph = build_graph(nodes, edges)
+
+ graph_ref = build_graph({
+ **regular_op('input', {'type': 'Parameter'}),
+ **regular_op('log', {'op': 'Log', 'type': 'Log'}),
+ **regular_op('softmax', {'op': 'SoftMax', 'type': 'SoftMax', 'axis': -2}),
+ **result('output'),
+ },
+ [
+ ('input', 'softmax'),
+ ('softmax', 'log'),
+ ('log', 'output'),
+ ])
+
+ graph.graph['layout'] = 'NHWC'
+ graph.stage = 'front'
+
+ LogSoftmaxFrontReplacer().find_and_replace_pattern(graph)
+
+ (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+ self.assertTrue(flag, resp)
+ self.assertTrue(graph.get_op_nodes(op='Log')[0].name == 'my_logsoftmax')
limitations under the License.
"""
import math
+
import numpy as np
from extensions.ops.MatMul import MatMul
import unittest
import numpy as np
+from generator import generator, generate
from extensions.front.Pack import Pack
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
-
-from generator import generator, generate
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_0': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
import logging as log
from mo.front.common.partial_infer.utils import int64_array
-from mo.graph.graph import Graph
from mo.front.common.replacement import FrontReplacementPattern
+from mo.graph.graph import Graph
from mo.ops.const import Const
from mo.utils.error import Error
from mo.front.common.replacement import FrontReplacementPattern
from mo.graph.graph import Graph
from mo.ops.const import Const
-from mo.ops.result import Result
class TopKNormalize(FrontReplacementPattern):
import numpy as np
+from extensions.ops.elementwise import Add, Mul
from mo.front.common.replacement import FrontReplacementPattern
from mo.graph.graph import Graph
from mo.ops.const import Const
-from extensions.ops.elementwise import Add, Mul
class BinaryFakeQuantizeNormalization(FrontReplacementPattern):
import numpy as np
from extensions.front.binary_quantize_normalization import BinaryFakeQuantizeNormalization
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
graph_nodes = {
'0': {'name': 'input', 'kind': 'op', 'op': 'Parameter'},
from extensions.front.caffe.accum_ext import AccumFrontExtractor
from extensions.ops.accum import AccumOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeAccumProtoLayer:
from extensions.front.caffe.argmax_ext import ArgMaxFrontExtractor
from extensions.ops.argmax import ArgMaxOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeArgMaxProtoLayer:
limitations under the License.
"""
+from extensions.ops.elementwise import Add
from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Node, Graph
-from extensions.ops.elementwise import Add
from mo.ops.scale_shift import ScaleShiftOp
from extensions.front.caffe.bn import BNToScaleShift
from mo.graph.graph import Node
+from mo.utils.ir_engine.compare_graphs import compare_graphs
from mo.utils.unittest.extractors import FakeParam
from mo.utils.unittest.graph import build_graph_with_edge_attrs, build_graph_with_attrs
-from mo.utils.ir_engine.compare_graphs import compare_graphs
class FakeBNProtoLayer:
from extensions.front.caffe.correlation_ext import CorrelationFrontExtractor
from extensions.ops.correlation import CorrelationOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeCorrProtoLayer:
from extensions.front.caffe.ctcgreedydecoder_ext import CTCGreedyDecoderFrontExtractor
from extensions.ops.ctc_greedy_decoder import CTCGreedyDecoderOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeCTCGreedyDecoderProtoLayer:
from extensions.front.caffe.data_augmentation_ext import DataAugmentationFrontExtractor
from extensions.ops.data_augmentation import DataAugmentationOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeDAProtoLayer:
limitations under the License.
"""
+from extensions.ops.activation_ops import Elu
from mo.front.caffe.collect_attributes import collect_attributes
from mo.front.extractor import FrontExtractorOp
-from extensions.ops.activation_ops import Elu
class ELUFrontExtractor(FrontExtractorOp):
"""
import unittest
-
from unittest.mock import patch
from extensions.front.caffe.elu import ELUFrontExtractor
from extensions.front.caffe.grn_ext import GRNFrontExtractor
from extensions.ops.grn import GRNOp
-from mo.utils.unittest.extractors import FakeMultiParam
-from mo.utils.unittest.graph import FakeNode
from mo.front.common.partial_infer.elemental import copy_shape_infer
from mo.ops.op import Op
+from mo.utils.unittest.extractors import FakeMultiParam
+from mo.utils.unittest.graph import FakeNode
class FakeGRNProtoLayer:
from extensions.front.caffe.normalize_ext import NormalizeFrontExtractor
from extensions.ops.normalize import NormalizeOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeNormalizeProtoLayer:
from extensions.front.caffe.power_file_ext import PowerFileFrontExtractor
from extensions.ops.power_file import PowerFileOp
-from mo.utils.unittest.extractors import FakeMultiParam
-from mo.utils.unittest.graph import FakeNode
from mo.front.common.partial_infer.elemental import copy_shape_infer
from mo.ops.op import Op
+from mo.utils.unittest.extractors import FakeMultiParam
+from mo.utils.unittest.graph import FakeNode
class FakePowerFileProtoLayer:
from extensions.front.caffe.prelu_ext import PreluFrontExtractor
from extensions.ops.prelu import PreluOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakePReLUProtoLayer:
from extensions.front.caffe.priorbox_clustered_ext import PriorBoxClusteredFrontExtractor
from extensions.ops.priorbox_clustered import PriorBoxClusteredOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakePriorBoxClusteredProtoLayer:
from extensions.front.caffe.priorbox_ext import PriorBoxFrontExtractor
from extensions.ops.priorbox import PriorBoxOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam, FakeParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeMultiParamListFields(FakeMultiParam):
from extensions.front.caffe.proposal_ext import ProposalFrontExtractor
from extensions.ops.proposal import ProposalOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode, FakeAttr
-from mo.ops.op import Op
class FakeProposalProtoLayer:
"""
import unittest
-from unittest.mock import patch
from extensions.front.caffe.proposal_python_ext import ProposalPythonFrontExtractor
from extensions.ops.proposal import ProposalOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode, FakeAttr
-from mo.ops.op import Op
class FakeProposalPythonProtoLayer:
from extensions.front.caffe.regionyolo_ext import RegionYoloFrontExtractor
from extensions.ops.regionyolo import RegionYoloOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeRegionYoloProtoLayer:
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.activation_ops import ReLU6
+from mo.front.extractor import FrontExtractorOp
class ReLU6FrontExtractor(FrontExtractorOp):
from extensions.front.caffe.reorgyolo_ext import ReorgYoloFrontExtractor
from extensions.ops.reorgyolo import ReorgYoloOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeReorgYoloProtoLayer:
from extensions.front.caffe.simplernms_ext import SimplerNMSFrontExtractor
from extensions.ops.simplernms import SimplerNMSOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeSimplerNMSProtoLayer:
from extensions.front.caffe.spatial_transformer_ext import SpatialTransformFrontExtractor
from extensions.ops.spatial_transformer import SpatialTransformOp
+from mo.ops.op import Op
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
class FakeSpatialTransformProtoLayer:
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.activation_ops import Tanh
+from mo.front.extractor import FrontExtractorOp
class TanhFrontExtractor(FrontExtractorOp):
"""
+from extensions.ops.elementwise import Add, Maximum, Mul
from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Node, Graph
-from extensions.ops.elementwise import Add, Maximum, Mul
class EltwiseNReplacement(FrontReplacementOp):
import numpy as np
from extensions.front.eltwise_n import EltwiseNReplacement
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
from extensions.ops.rank import Rank
from mo.front.common.partial_infer.utils import int64_array
from mo.front.common.replacement import FrontReplacementPattern
-from mo.front.tf.graph_utils import create_op_node_with_second_input
from mo.graph.graph import Graph
from mo.ops.const import Const
-from mo.ops.unsqueeze import Unsqueeze
class GlobalPoolingToReduce(FrontReplacementPattern):
import numpy as np
+from extensions.ops.elementwise import Mul, Add
from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Graph
from mo.ops.const import Const
-from extensions.ops.elementwise import Mul, Add
class ImageScaler(FrontReplacementOp):
import numpy as np
from extensions.front.image_scaler import ImageScaler
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
limitations under the License.
"""
-from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Node, Graph
from extensions.ops.elementwise import Add, Mul
from extensions.ops.mvn import MVN
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node, Graph
class InstanceNormalization(FrontReplacementOp):
import networkx as nx
from extensions.front.instance_normalization import InstanceNormalization
-from mo.utils.unittest.graph import build_graph
from mo.middle.pattern_match import node_match
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'input': {'kind': 'op', 'op': 'AnyOp'},
import unittest
import numpy as np
+
from extensions.front.kaldi.apply_counts import apply_biases_to_last_layer
from mo.utils.ir_engine.compare_graphs import compare_graphs
from mo.utils.unittest.graph import build_graph
+++ /dev/null
-"""
- Copyright (C) 2018-2020 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-from extensions.ops.Log import LogOp
-from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Graph, Node
-from mo.ops.softmax import Softmax
-
-
-class LogsoftmaxFrontReplacer(FrontReplacementOp):
- """
- Replace LogSoftmax operation by Softmax -> Log.
- """
- op = "logsoftmaxcomponent"
- enabled = True
-
- def replace_op(self, graph: Graph, node: Node):
- log = LogOp(graph, {'name': node.name + '/Log_'}).create_node()
- softmax = Softmax(graph, {'axis': 1, 'name': node.name + '/SoftMax_'}).create_node()
-
- # Connect nodes: input -> Softmax -> Log
- node.in_port(0).get_connection().set_destination(softmax.in_port(0))
- log.in_port(0).get_connection().set_source(softmax.out_port(0))
-
- # The "explicit" version of the return value is: [(out_node.id, 0)])
- return [log.id]
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.ops.softmax import LogSoftmax
+from mo.front.extractor import FrontExtractorOp
+
+
+class LogSoftMaxComponentExtractor(FrontExtractorOp):
+ op = 'logsoftmaxcomponent'
+ enabled = True
+
+ @classmethod
+ def extract(cls, node):
+ LogSoftmax.update_node_stat(node, {'axis': 1})
+ return cls.enabled
from mo.front.common.replacement import FrontReplacementOp
from mo.front.tf.graph_utils import create_op_with_const_inputs
from mo.graph.graph import Node, Graph
-from mo.ops.const import Const
from mo.ops.eltwise import Eltwise
from mo.ops.eltwise_n import EltwiseN
from mo.utils.error import Error
"""
from extensions.front.kaldi.sigmoid_ext import SigmoidFrontExtractor
-from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
from extensions.ops.activation_ops import Sigmoid
+from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
from mo.ops.op import Op
limitations under the License.
"""
-from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
from extensions.front.kaldi.tanh_component_ext import TanhFrontExtractor
from extensions.ops.activation_ops import Tanh
+from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
from mo.ops.op import Op
"""
import unittest
+from argparse import Namespace
import numpy as np
-from argparse import Namespace
-from mo.graph.graph import Node
from extensions.front.mxnet.add_input_data_to_prior_boxes import AddInputDataToPriorBoxes
+from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
import unittest
from extensions.front.mxnet.check_softmax_node_inputs import CheckSoftmaxNodeInputs
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class TestCheckSoftmaxNodeInputs(unittest.TestCase):
from mo.front.extractor import FrontExtractorOp
from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.convolution import Convolution
-from mo.front.common.extractors.utils import layout_attrs
+
class ConvFrontExtractor(FrontExtractorOp):
op = 'Convolution'
import unittest
from extensions.front.mxnet.custom import CustomFrontExtractorOp
-from mo.utils.unittest.graph import build_graph
from mo.front.extractor import FrontExtractorOp, MXNetCustomFrontExtractorOp
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
attrs = {'test_attr': 1}
"""
import numpy as np
-from extensions.ops.elementwise import Mul, Sub, Add, Maximum, Minimum, Div, Greater, GreaterEqual, Equal, Less, LessEqual, Pow, NotEqual, LogicalAnd, LogicalOr
+from extensions.ops.elementwise import Mul, Sub, Add, Maximum, Minimum, Div, Greater, GreaterEqual, Equal, Less, \
+ LessEqual, Pow, NotEqual, LogicalAnd, LogicalOr
from mo.front.extractor import FrontExtractorOp
from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.graph.graph import Node
import numpy as np
from extensions.front.mxnet.gather import GatherFrontReplacer
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class GatherTest(unittest.TestCase):
limitations under the License.
"""
-from mo.graph.graph import Node
from extensions.ops.instance_normalization import InstanceNormalization
from mo.front.extractor import FrontExtractorOp
from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.graph.graph import Node
class InstanceNormFrontExtractor(FrontExtractorOp):
limitations under the License.
"""
+from extensions.ops.DetectionOutput import DetectionOutput
from mo.front.extractor import FrontExtractorOp
from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
-from extensions.ops.DetectionOutput import DetectionOutput
class MultiBoxDetectionOutputExtractor(FrontExtractorOp):
op = '_contrib_MultiBoxDetection'
from mo.front.common.replacement import FrontReplacementOp
from mo.front.tf.graph_utils import create_op_node_with_second_input
from mo.graph.graph import Graph
-from mo.ops.const import Const
from mo.ops.reshape import Reshape
from mo.ops.shape import Shape
from mo.ops.squeeze import Squeeze
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.parameter import Parameter
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
from mo.ops.const import Const
-from mo.ops.pad import Pad
class NullFrontExtractor(FrontExtractorOp):
import numpy as np
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.pad import AttributedPad
limitations under the License.
"""
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.crop import Crop
import numpy as np
-from mo.graph.graph import Graph
from extensions.ops.elementwise import Mul
-from mo.ops.const import Const
from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import Graph
+from mo.ops.const import Const
class SoftmaxFrontReplacementSubgraph(FrontReplacementSubgraph):
limitations under the License.
"""
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.softmax import Softmax
limitations under the License.
"""
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.softmax import Softmax
limitations under the License.
"""
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.softmax import Softmax
limitations under the License.
"""
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.squeeze import Squeeze
from mo.front.tf.graph_utils import create_op_node_with_second_input
from mo.graph.graph import Node, Graph
from mo.middle.pattern_match import find_pattern_matches
-from mo.ops.result import Result
from mo.ops.reshape import Reshape
+from mo.ops.result import Result
class SsdPatternDetectionOutputReplacer(FrontReplacementSubgraph):
import unittest
from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class TestSsdPatternFlattenSoftmaxActivation(unittest.TestCase):
import unittest
from extensions.front.mxnet.ssd_pattern_remove_flatten import SsdPatternRemoveFlatten
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class TestSsdPatternRemoveFlatten(unittest.TestCase):
import unittest
from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class TestSsdPatternRemoveReshape(unittest.TestCase):
limitations under the License.
"""
-import networkx as nx
-
from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation
from extensions.front.mxnet.ssd_pattern_remove_flatten import SsdPatternRemoveFlatten
from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape
import unittest
from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class TestSsdPatternRemoveTranspose(unittest.TestCase):
limitations under the License.
"""
-import networkx as nx
-
-from mo.graph.graph import Graph
-from mo.front.common.replacement import FrontReplacementPattern
-from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose
from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation
+from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose
+from mo.front.common.replacement import FrontReplacementPattern
+from mo.graph.graph import Graph
class SsdReorderDetectionOutInputs(FrontReplacementPattern):
import numpy as np
from extensions.front.mxnet.ssd_reorder_detection_out_inputs import SsdReorderDetectionOutInputs
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class TestSsdReorderDetectionOutInputs(unittest.TestCase):
limitations under the License.
"""
+from extensions.ops.pack import PackOp
from mo.front.extractor import FrontExtractorOp
from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
-from extensions.ops.pack import PackOp
class StackFrontExtractor(FrontExtractorOp):
limitations under the License.
"""
-import ast
import numpy as np
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.ops.const import Const
import logging as log
-import networkx as nx
-
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.graph.graph import Graph
import onnx
from extensions.front.onnx.affine_ext import AffineFrontExtractor
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class AffineONNXExtractorTest(unittest.TestCase):
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.constant_fill import ConstantFill
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr
import numpy as np
-from mo.front.common.extractors.utils import layout_attrs
+from mo.front.common.partial_infer.utils import int64_array
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr, get_onnx_autopad
from mo.ops.convolution import Convolution
from mo.utils.error import Error
-from mo.front.common.partial_infer.utils import int64_array
class ConvFrontExtractor(FrontExtractorOp):
import onnx
from extensions.front.onnx.conv_ext import ConvTransposeFrontExtractor
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
from mo.utils.error import Error
+from mo.utils.unittest.graph import build_graph
class ConvTransposeONNXExtractorTest(unittest.TestCase):
import onnx
from extensions.front.onnx.crop_ext import CropFrontExtractor
-from mo.utils.unittest.graph import build_graph
from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
class CropONNXExtractorTest(unittest.TestCase):
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr, get_onnx_autopad
from mo.ops.deformable_convolution import DeformableConvolution
-from mo.utils.error import Error
-from mo.front.common.partial_infer.utils import int64_array
class DeformableConvExtractor(FrontExtractorOp):
limitations under the License.
"""
-import onnx
import unittest
import numpy as np
+import onnx
from extensions.front.onnx.detection_output import DetectionOutputFrontExtractor
from extensions.ops.DetectionOutput import DetectionOutput
"""
from math import log
+
import numpy as np
from extensions.ops.detectionoutput_onnx import ExperimentalDetectronDetectionOutput
limitations under the License.
"""
+from extensions.ops.identity import IdentityOp
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr
-from extensions.ops.identity import IdentityOp
from mo.utils.error import Error
import numpy as np
from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
-
from mo.front.onnx.extractors.utils import onnx_attr
from mo.graph.graph import Node
from mo.ops.reshape import Reshape
-
input_fpn_heads = ('486', '454', '422', '390')
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from extensions.ops.non_zero import NonZero
+from mo.front.extractor import FrontExtractorOp
+
+
+class NonZeroExtractor(FrontExtractorOp):
+ op = 'NonZero'
+ enabled = True
+
+ @classmethod
+ def extract(cls, node):
+ NonZero.update_node_stat(node, {'output_type': np.int64})
+ return cls.enabled
--- /dev/null
+[
+ {
+ "custom_attributes":
+ {
+ "fpn_heads": ["634", "635", "636", "637"],
+ "ROI_feature_extractor_inputs": ["2475", "2834", "3192"],
+ "ROI_feature_extractor_outputs": ["2614", "2972", "3330"]
+ },
+ "id": "ONNXPersonDetectionCrossroadReplacement",
+ "match_kind": "general"
+ }
+]
--- /dev/null
+"""
+ Copyright (c) 2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.roifeatureextractor_onnx import ExperimentalDetectronROIFeatureExtractor
+from mo.front.common.partial_infer.utils import int64_array
+from mo.front.tf.replacement import FrontReplacementFromConfigFileGeneral
+from mo.graph.graph import Graph, Node, rename_node
+
+
+class ONNXPersonDetectionCrossroadReplacement(FrontReplacementFromConfigFileGeneral):
+ """
+ Insert ExperimentalDetectronROIFeatureExtractor layers instead of sub-graphs of the model.
+ """
+ replacement_id = 'ONNXPersonDetectionCrossroadReplacement'
+
+ def transform_graph(self, graph: Graph, replacement_descriptions: dict):
+ fpn_heads = replacement_descriptions['fpn_heads']
+ for inp, out in zip(replacement_descriptions['ROI_feature_extractor_inputs'],
+ replacement_descriptions['ROI_feature_extractor_outputs']):
+ insert_experimental_layers(graph, fpn_heads, inp, out)
+
+
+def insert_experimental_layers(graph: Graph, input_fpn_heads: list, inp: str, out: str):
+ old_output_node = Node(graph, out)
+ output_name = old_output_node.soft_get('name', old_output_node.id)
+ old_output_node_name = output_name + '/old'
+ rename_node(old_output_node, old_output_node_name)
+
+ input_fpn_head_nodes = [Node(graph, node_id) for node_id in input_fpn_heads]
+ fpn_roi_align = ExperimentalDetectronROIFeatureExtractor(graph, {'name': output_name,
+ 'distribute_rois_between_levels': 1,
+ 'image_id': 0,
+ 'output_size': 7,
+ 'preserve_rois_order': 1,
+ 'pyramid_scales': int64_array(
+ [4, 8, 16, 32, 64]),
+ 'sampling_ratio': 2, }).create_node()
+ rename_node(fpn_roi_align, output_name)
+ fpn_roi_align.in_port(0).connect(Node(graph, inp).out_port(0))
+ for ind, fpn_node in enumerate(input_fpn_head_nodes):
+ fpn_roi_align.in_port(ind + 1).connect(fpn_node.out_port(0))
+
+ old_output_node.out_port(0).get_connection().set_source(fpn_roi_align.out_port(0))
def common_onnx_pool_extractor(node):
+ kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
+ final_kernel_shape = np.array([1, 1, *[x for x in kernel_shape]], dtype=np.int64) if kernel_shape is not None else None
+
pads = onnx_attr(node, 'pads', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
- # Try to convert slightly incorrect models with insufficient pad parameters
- if pads is not None and (pads.size == 2 or pads.size % 2 != 0):
- log.warning(
- 'Node {} has pad = {} which is ill-formed -- it should consist of N%2==0 elements.'.format(node.name,
- pads))
+ if kernel_shape is not None and pads is not None and kernel_shape.size * 2 != pads.size:
+ log.warning('Node {} has pad = {} which is ill-formed -- it should have even amount of elements.'.format(
+ node.soft_get('name', node.id), pads))
+
+ # Try to convert slightly incorrect models with insufficient pad parameters
+ assert pads.size * 2 == kernel_shape.size
pads = np.concatenate([pads, pads])
log.warning('Extended pads to {}'.format(pads))
final_strides = np.array([1, 1, *[x for x in strides]], dtype=np.int64) if strides is not None else None
dilations = onnx_attr(node, 'dilations', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
- assert dilations is None, 'dilations attribute is not supported in node {}'.format(node.id)
-
- kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
- final_kernel_shape = np.array([1, 1, *[x for x in kernel_shape]], dtype=np.int64) if kernel_shape is not None else None
+ assert dilations is None or np.all(dilations == 1),\
+ 'Node {} has "dilations" attribute with values not equal to 1s which is not supported'.format(node.id)
# exclude_pad = True only when count_include_pad == 0
exclude_pad = onnx_attr(node, 'count_include_pad', 'i', default=0) == 0
limitations under the License.
"""
-import onnx
import unittest
import numpy as np
+import onnx
from extensions.front.onnx.priorbox_clustered_ext import PriorBoxClusteredFrontExtractor
from extensions.ops.priorbox_clustered import PriorBoxClusteredOp
limitations under the License.
"""
-import onnx
import unittest
import numpy as np
+import onnx
from extensions.front.onnx.priorbox_ext import PriorBoxFrontExtractor
from extensions.ops.priorbox import PriorBoxOp
limitations under the License.
"""
+from extensions.ops.fakequantize import FakeQuantize
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr
-from extensions.ops.fakequantize import FakeQuantize
class FakeQuantizeFrontExtractor(FrontExtractorOp):
--- /dev/null
+"""
+ Copyright (c) 2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.reverse_sequence import ReverseSequence
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class ReverseSequenceExtractor(FrontExtractorOp):
+ op = 'ReverseSequence'
+ enabled = True
+
+ @staticmethod
+ def extract(node):
+ batch_axis = onnx_attr(node, 'batch_axis', 'i', default=1)
+ time_axis = onnx_attr(node, 'time_axis', 'i', default=0)
+
+ attrs = {
+ 'batch_axis': batch_axis,
+ 'seq_axis': time_axis,
+ }
+ ReverseSequence.update_node_stat(node, attrs)
+ return __class__.enabled
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from extensions.ops.roialign import ROIAlign
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class ROIAlignExtractor(FrontExtractorOp):
+ op = 'ROIAlign'
+ enabled = True
+
+ @classmethod
+ def extract(cls, node):
+ mode = onnx_attr(node, 'mode', 's', default=b'avg').decode()
+ output_height = onnx_attr(node, 'output_height', 'i', default=1)
+ output_width = onnx_attr(node, 'output_width', 'i', default=1)
+ sampling_ratio = onnx_attr(node, 'sampling_ratio', 'i', default=0)
+ spatial_scale = onnx_attr(node, 'spatial_scale', 'f', default=1.0)
+
+ ROIAlign.update_node_stat(node, {'pooled_h': output_height, 'pooled_w': output_width,
+ 'sampling_ratio': sampling_ratio, 'spatial_scale': spatial_scale,
+ 'mode': mode})
+ return cls.enabled
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from extensions.ops.scatter import ScatterElementsUpdate
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class ScatterExtractor(FrontExtractorOp):
+ # deprecated ONNX operation
+ op = 'Scatter'
+ enabled = True
+
+ @classmethod
+ def extract(cls, node):
+ axis = onnx_attr(node, 'axis', 'i', default=0)
+ ScatterElementsUpdate.update_node_stat(node, {'axis': axis})
+ return cls.enabled
+
+
+class ScatterElementsExtractor(FrontExtractorOp):
+ op = 'ScatterElements'
+ enabled = True
+
+ @classmethod
+ def extract(cls, node):
+ axis = onnx_attr(node, 'axis', 'i', default=0)
+ ScatterElementsUpdate.update_node_stat(node, {'axis': axis})
+ return cls.enabled
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr
-from mo.ops.softmax import Softmax
+from mo.ops.softmax import LogSoftmax, Softmax
-class SoftmaxFrontExtractor(FrontExtractorOp):
+class SoftmaxExtractor(FrontExtractorOp):
op = 'Softmax'
enabled = True
@classmethod
def extract(cls, node):
axis = onnx_attr(node, 'axis', 'i', default=1)
+ Softmax.update_node_stat(node, {'axis': axis})
+ return cls.enabled
+
- attrs = {
- 'axis': axis
- }
+class LogSoftmaxExtractor(FrontExtractorOp):
+ op = 'LogSoftmax'
+ enabled = True
- # update the attributes of the node
- Softmax.update_node_stat(node, attrs)
+ @classmethod
+ def extract(cls, node):
+ axis = onnx_attr(node, 'axis', 'i', default=1)
+ LogSoftmax.update_node_stat(node, {'axis': axis})
return cls.enabled
import numpy as np
-from mo.ops.squeeze import Squeeze
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr
+from mo.ops.squeeze import Squeeze
class SqueezeFrontExtractor(FrontExtractorOp):
from generator import generator, generate
from extensions.front.onnx.transpose_ext import TransposeFrontExtractor
-from mo.ops.op import Op
from extensions.ops.transpose import Transpose
+from mo.ops.op import Op
from mo.utils.unittest.extractors import PB
import numpy as np
from extensions.front.reciprocal import ReciprocalReplacer
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.common.replacement import FrontReplacementPattern
+from mo.graph.graph import Graph
+from mo.ops.const import Const
+
+
+class ScatterNormalizer(FrontReplacementPattern):
+ enabled = True
+
+ def find_and_replace_pattern(self, graph: Graph):
+ for node in graph.get_op_nodes(is_scatter=True):
+ name = node.soft_get('name', node.id)
+ input_ports_count = len([port for port in node.in_ports().values() if not port.disconnected()])
+ has_axis = node.has_valid('axis')
+
+ if has_axis:
+ assert input_ports_count == 3, \
+ '{} node {} has unexpected number of input ports {}'.format(node.op, name, input_ports_count)
+ const = Const(graph, {'name': name + '/axis', 'value': np.int64(node.axis)}).create_node()
+ node.add_input_port(3, skip_if_exist=True)
+ node.in_port(3).connect(const.out_port(0))
+ del node['axis']
+ else:
+ assert input_ports_count == 4, \
+ '{} node {} has unexpected number of input ports {}'.format(node.op, name, input_ports_count)
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+from extensions.front.scatter_normalizer import ScatterNormalizer
+from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph, result, connect, \
+ regular_op_with_empty_data
+
+nodes = {
+ **regular_op_with_empty_data('placeholder_1', {'type': 'Parameter'}),
+ **regular_op_with_empty_data('placeholder_2', {'type': 'Parameter'}),
+ **regular_op_with_empty_data('placeholder_3', {'type': 'Parameter'}),
+ **regular_op_with_empty_data('node', {'op': 'ScatterElementsUpdate', 'is_scatter': True}),
+ **regular_op_with_empty_data('axis', {'type': 'Const', 'value': None}),
+ **result(),
+}
+
+edges = [
+ *connect('placeholder_1', '0:node'),
+ *connect('placeholder_2', '1:node'),
+ *connect('placeholder_3', '2:node'),
+ *connect('node', 'output'),
+]
+
+
+class TestDiv(unittest.TestCase):
+ def test_ScatterElementsUpdate_has_axis_and_3_inputs(self):
+ graph = build_graph(nodes, edges, {'node': {'axis': 1}}, nodes_with_edges_only=True)
+ ScatterNormalizer().find_and_replace_pattern(graph)
+
+ graph_ref = build_graph(nodes, [
+ *edges,
+ *connect('axis', '3:node'),
+ ], {'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+
+ (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+ self.assertTrue(flag, resp)
+
+ def test_ScatterElementsUpdate_has_axis_and_4_inputs(self):
+ graph = build_graph(nodes, [
+ *edges,
+ *connect('axis', '3:node'),
+ ], {'node': {'axis': 1}, 'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+ self.assertRaises(AssertionError, ScatterNormalizer().find_and_replace_pattern, graph)
+
+ def test_ScatterElementsUpdate_has_no_axis_and_3_inputs(self):
+ graph = build_graph(nodes, edges, nodes_with_edges_only=True)
+ self.assertRaises(AssertionError, ScatterNormalizer().find_and_replace_pattern, graph)
+
+ def test_ScatterElementsUpdate_has_no_axis_and_4_inputs(self):
+ graph = build_graph(nodes, [
+ *edges,
+ *connect('axis', '3:node'),
+ ], {'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+ ScatterNormalizer().find_and_replace_pattern(graph)
+
+ graph_ref = build_graph(nodes, [
+ *edges,
+ *connect('axis', '3:node'),
+ ], {'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+
+ (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+ self.assertTrue(flag, resp)
import numpy as np
from extensions.front.softsign_replacer import SoftSign
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter', 'shape': np.array([1, 227, 227, 3])},
import numpy as np
from extensions.front.squared_difference import SquaredDifference
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter', 'shape': np.array([1, 227, 227, 3])},
import logging as log
-import networkx as nx
-
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.graph.graph import Graph
import logging as log
-import networkx as nx
-
from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Node, Graph
from mo.utils.error import Error
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from mo.front.common.partial_infer.elemental import single_output_infer
+from mo.front.extractor import FrontExtractorOp
class LoopCondFrontExtractor(FrontExtractorOp):
limitations under the License.
"""
-import numpy as np
-
from extensions.front.standalone_const_eraser import StandaloneConstEraser
from extensions.ops.DetectionOutput import DetectionOutput
from mo.front.common.partial_infer.utils import int64_array
"""
import logging as log
+from typing import Optional
from extensions.ops.elementwise import Mul
from extensions.ops.interpolate import Interpolate
from mo.ops.const import Const
from mo.ops.shape import Shape
from mo.ops.strided_slice import StridedSlice
-from typing import Optional
def get_concat_after_split(split: Node) -> Optional[Node]:
from extensions.front.tf.SplitConcatPairToInterpolate import SplitConcatPairToInterpolate
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
-
+from mo.utils.unittest.graph import build_graph
graph_node_attrs_for_2d_spatial_case = {
'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
limitations under the License.
"""
from extensions.ops.select import Select
-from mo.graph.graph import Graph
from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import Graph
class SwitchMergeOptimization(FrontReplacementSubgraph):
from extensions.front.tf.SwitchMergeOptimization import SwitchMergeOptimization
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class SwitchMergeOptimizationTest(unittest.TestCase):
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.TensorArrayGather import TensorArrayGather
-from mo.front.tf.extractors.utils import tf_int_list, tf_tensor_shape
+from mo.front.extractor import FrontExtractorOp
+from mo.front.tf.extractors.utils import tf_tensor_shape
from mo.graph.graph import Node
import networkx as nx
from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.utils.error import Error
import logging as log
-import numpy as np
-
-from extensions.ops.bucketize import Bucketize
from mo.front.common.replacement import FrontReplacementSubgraph
-from mo.graph.graph import Graph, Node
+from mo.graph.graph import Graph
from mo.ops.const import Const
from mo.front.common.partial_infer.utils import convert_tf_padding_to_str, int64_array
from mo.front.extractor import FrontExtractorOp
-from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_data_format_channel, tf_data_format_batch, \
+from mo.front.tf.extractors.utils import tf_data_format_channel, tf_data_format_batch, \
tf_int_list
from mo.ops.convolution import Convolution
from mo.ops.op import PermuteAttrs
attrs = {
'type': 'Convolution',
- 'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding']),
+ 'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding'].s.decode()),
'bias_addable': True,
'bias_term': False,
'dilation': dilations,
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
-from mo.front.common.partial_infer.utils import convert_tf_padding_to_str, int64_array
+from mo.front.common.partial_infer.utils import convert_deconv_tf_padding_to_str, int64_array
from mo.front.extractor import FrontExtractorOp
from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_data_format_channel, tf_data_format_batch, \
tf_int_list
data_format = node.pb.attr["data_format"]
return {
- 'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding']),
+ 'auto_pad': convert_deconv_tf_padding_to_str(node.pb.attr['padding'].s.decode()),
'bias_addable': True,
'bias_term': False,
'spatial_dims': tf_data_format_spatial(data_format),
"""
import logging as log
-import numpy as np
-
from mo.front.common.partial_infer.utils import int64_array
from mo.front.common.replacement import FrontReplacementOp
from mo.front.tf.extractors.utils import tf_dtype_extractor
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.ops.const import Const
import unittest
from extensions.front.tf.mvn_unrolled import MVNUnrolled
+from extensions.ops.mvn import MVN
from mo.ops.op import Op
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
-from extensions.ops.mvn import MVN
+from mo.utils.unittest.graph import build_graph_with_attrs
class MVNUnrolledMatchingTests(unittest.TestCase):
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
from mo.front.common.partial_infer.utils import convert_tf_padding_to_str
from mo.front.extractor import FrontExtractorOp
-from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_data_format_channel, tf_data_format_batch, \
- tf_int_list
+from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_int_list
from mo.ops.pooling import Pooling
data_format = node.pb.attr["data_format"]
attrs = {
- 'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding']),
+ 'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding'].s.decode()),
'window': tf_int_list(node.pb.attr["ksize"].list),
'spatial_dims': tf_data_format_spatial(data_format),
'pad': None, # will be inferred when input shape is known
"""
from mo.front.extractor import FrontExtractorOp
-from mo.ops.softmax import Softmax
+from mo.ops.softmax import LogSoftmax, Softmax
-class SoftmaxFrontExtractor(FrontExtractorOp):
+class SoftmaxExtractor(FrontExtractorOp):
op = 'Softmax'
enabled = True
axis = node.pb.attr['axis'].i
Softmax.update_node_stat(node, {'axis': axis})
return cls.enabled
+
+
+class LogSoftmaxExtractor(FrontExtractorOp):
+ op = 'LogSoftmax'
+ enabled = True
+
+ @classmethod
+ def extract(cls, node):
+ # the default value for the TF LogSoftmax is -1
+ axis = -1
+ if 'axis' in node.pb.attr:
+ axis = node.pb.attr['axis'].i
+ LogSoftmax.update_node_stat(node, {'axis': axis})
+ return cls.enabled
See the License for the specific language governing permissions and
limitations under the License.
"""
-from extensions.ops.split import Split
from extensions.ops.elementwise import Sub
from extensions.ops.rank import Rank
+from extensions.ops.split import Split
from extensions.ops.transpose import Transpose
from mo.front.common.partial_infer.utils import int64_array
from mo.front.common.replacement import FrontReplacementPattern
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.space_to_depth import SpaceToDepth
+from mo.front.extractor import FrontExtractorOp
class SpaceToDepthFrontExtractor(FrontExtractorOp):
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.sparse_fill_empty_rows import SparseFillEmptyRows
from mo.front.extractor import FrontExtractorOp
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.sparse_segment_mean import SparseSegmentMean
from mo.front.extractor import FrontExtractorOp
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.sparse_segment_sqrtn import SparseSegmentSqrtN
from mo.front.extractor import FrontExtractorOp
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.sparse_segment_sum import SparseSegmentSum
from mo.front.extractor import FrontExtractorOp
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.sparse_to_dense import SparseToDense
from mo.front.extractor import FrontExtractorOp
from extensions.ops.sparse_weighted_sum import ExperimentalSparseWeightedSum
from mo.front.common.replacement import FrontReplacementSubgraph
-from mo.graph.graph import Node, Graph
-from mo.ops.op import Op
-from mo.ops.shape import Shape
+from mo.graph.graph import Graph
class ExperimentalSparseWeightedSumFrontReplacer(FrontReplacementSubgraph):
import unittest
-from extensions.front.tf.sparse_weighted_sum import ExperimentalSparseWeightedSumFrontReplacer, ExperimentalSparseWeightedSumFrontReplacer2
+from extensions.front.tf.sparse_weighted_sum import ExperimentalSparseWeightedSumFrontReplacer, \
+ ExperimentalSparseWeightedSumFrontReplacer2
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class ExperimentalSparseWeightedSumFrontReplacersTest(unittest.TestCase):
import numpy as np
from extensions.front.tf.swish import Swish
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': np.array([1, 227, 227, 3]), 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.topk import TopK
+from mo.front.extractor import FrontExtractorOp
class TopKExtractor(FrontExtractorOp):
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.unique import Unique
from mo.front.extractor import FrontExtractorOp
import numpy as np
+from extensions.middle.ApplyNHWCtoNCHWpermutation import ApplyNHWCtoNCHWpermutation
from extensions.middle.InsertLayoutPropagationTransposes import is_input_data_in_correct_layout, \
is_output_data_in_correct_layout
-from extensions.middle.ApplyNHWCtoNCHWpermutation import ApplyNHWCtoNCHWpermutation
from extensions.middle.pass_separator import PostMiddleStart
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph, Node
See the License for the specific language governing permissions and
limitations under the License.
"""
+import numpy as np
+
from extensions.middle.EltwiseChecker import EltwiseChecker
from extensions.ops.elementwise import Add
from mo.front.common.layout import get_features_dim
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
from mo.ops.unsqueeze import Unsqueeze
-import numpy as np
class BiasAddInputBroadcasting(MiddleReplacementPattern):
limitations under the License.
"""
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.middle.passes.eliminate import remove_op_node_with_data_node
from mo.middle.replacement import MiddleReplacementPattern
from extensions.middle.ConvertGroupedStridedSlice import ConvertGroupedStridedSlice
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
limitations under the License.
"""
+import unittest
+
import numpy as np
-import unittest
from extensions.middle.CutInputHavingZeroDimFromConcat import CutInputHavingZeroDimFromConcat
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
-
+from mo.utils.unittest.graph import build_graph
node_attrs_for_the_case_when_there_are_no_zero_shape_constants = {
'const0': {
limitations under the License.
"""
-import numpy as np
-
from extensions.ops.gather import Gather
from mo.front.common.partial_infer.utils import int64_array
+from mo.graph.graph import Graph
from mo.middle.replacement import MiddleReplacementPattern
-from mo.ops.op import PermuteAttrs
from mo.ops.const import Const
-from mo.graph.graph import Graph, rename_nodes
+from mo.ops.op import PermuteAttrs
class Deconvolution3rdInputNormalization(MiddleReplacementPattern):
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
from mo.ops.op import Op
-from mo.ops.reshape import Reshape
from mo.ops.squeeze import Squeeze
from mo.ops.unsqueeze import Unsqueeze
from typing import Dict
-import logging as log
-
import numpy as np
+from extensions.ops.elementwise import Mul, Add
from extensions.ops.mvn import MVN
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph, Node
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
-from extensions.ops.elementwise import Mul, Add
from mo.ops.reshape import Reshape
from mo.ops.shape import Shape
from mo.utils.shape import node_to_get_spatial_dimensions_value, node_to_get_features_dimension_value, \
from extensions.middle.pass_separator import PostMiddleStart
from extensions.ops.transpose import Transpose
-from mo.middle.replacement import MiddleReplacementPattern
from mo.graph.graph import Graph, Node
+from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
from mo.ops.op import PermuteAttrs
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
import unittest
+import numpy as np
+
from extensions.middle.InsertSelect import AddSelectBeforeMemoryNodePattern
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class InsertSelectTests(unittest.TestCase):
"""
import logging as log
+from typing import List
from extensions.ops.interpolate import Interpolate
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph, Node
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
-from typing import List
def node_has_one_consumer(node: Node) -> bool:
from extensions.middle.InterpolateSequenceToInterpolate import InterpolateSequenceToInterpolate
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
-
+from mo.utils.unittest.graph import build_graph
graph_node_attrs_for_2d_case_1 = {
'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
"""
import unittest
+
import numpy as np
+
from extensions.middle.L2NormToNorm import L2NormToNorm
from mo.utils.ir_engine.compare_graphs import compare_graphs
from mo.utils.unittest.graph import build_graph_with_attrs
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
from mo.ops.op import Op
-from mo.ops.reshape import Reshape
from mo.ops.squeeze import Squeeze
from mo.ops.unsqueeze import Unsqueeze
"""
import numpy as np
-from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph, Node
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.concat import Concat
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
from mo.front.common.partial_infer.utils import int64_array
from mo.front.tf.graph_utils import create_op_node_with_second_input
from mo.graph.graph import Graph
from mo.middle.replacement import MiddleReplacementPattern
-from mo.ops.const import Const
-from mo.ops.reshape import Reshape
-from mo.ops.tile import Tile
+from mo.ops.unsqueeze import Unsqueeze
class MXTileReplacer(MiddleReplacementPattern):
"""
- This class Reshape Tile operation if len input shape < output shape.
+ Aligns Tile operation from MxNet framework with OpenVINO Tile
+
+ MxNet has no restrictions for `tile_array` input of `Tile` operation.
+ If len(tile_array) > rank(data), this transformation will insert Unsqueeze before Tile operation,
+ because in this case output_shape > input_shape
+
+ DOC link: https://beta.mxnet.io/api/ndarray/_autogen/mxnet.ndarray.tile.html#mxnet.ndarray.tile
"""
enabled = True
- force_clean_up = True
def pattern(self):
return dict(
@staticmethod
def replace_pattern(graph: Graph, match: dict):
- mxtile = match['tile']
-
- in_shape = mxtile.in_port(0).data.get_shape()
- out_shape = mxtile.out_node(0).shape
-
- tile_array_diff = (len(out_shape) - len(in_shape))
- if tile_array_diff > 0:
- reshape_shape = np.copy(in_shape)
- for i in range(tile_array_diff):
- reshape_shape = np.insert(in_shape, 0, 1, axis=0)
- reshape_node = create_op_node_with_second_input(graph, Reshape, int64_array(reshape_shape), dict(name=mxtile.id + "/Reshape"))
- mxtile.in_port(0).get_source().get_connection().set_destination(reshape_node.in_port(0))
- reshape_node.out_port(0).get_connection().set_destination(mxtile.in_port(0))
+ node = match['tile']
+ name = node.soft_get('name', node.id)
+ in_shape = node.in_port(0).data.get_shape()
+ out_shape = node.out_port(0).data.get_shape()
+
+ tile_array_diff = len(out_shape) - len(in_shape)
+ if tile_array_diff == 0:
+ return
+ assert tile_array_diff > 0,\
+ 'Unexpected difference between rank(input) and rank(output) for node {}'.format(name)
+ unsqueeze_dims = int64_array(range(tile_array_diff))
+ unsqueeze = create_op_node_with_second_input(graph, Unsqueeze, unsqueeze_dims,
+ dict(name=name + '/Unsqueeze', override_output_shape=True))
+ node.in_port(0).get_connection().insert_node(unsqueeze)
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+from extensions.middle.MXTileReplacer import MXTileReplacer
+from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
+
+nodes_attributes = {
+ 'placeholder': {'kind': 'op', 'op': 'Parameter'},
+ 'placeholder_data': {'kind': 'data'},
+ 'tile': {'kind': 'op', 'op': 'Tile'},
+ 'tile_data': {'kind': 'data', 'shape': int64_array([1, 1, 1, 1])},
+ 'result': {'kind': 'op', 'op': 'Result'},
+
+ 'unsqueeze_1': {'kind': 'op', 'op': 'Unsqueeze'},
+ 'unsqueeze_1_data': {'kind': 'data'},
+ 'unsqueeze_1_const': {'kind': 'op', 'op': 'Const'},
+ 'unsqueeze_1_const_data': {'kind': 'data'},
+}
+
+
+class MXTileReplacerTest(unittest.TestCase):
+
+ def test_insert_one_unsqueeze(self):
+ graph = build_graph(
+ nodes_attributes,
+ [
+ ('placeholder', 'placeholder_data'),
+ ('placeholder_data', 'tile'),
+ ('tile', 'tile_data'),
+ ('tile_data', 'result')
+ ],
+ {
+ 'placeholder_data': {'shape': int64_array([1, 1, 1])}
+ },
+ nodes_with_edges_only=True
+ )
+
+ ref_graph = build_graph(
+ nodes_attributes,
+ [
+ ('placeholder', 'placeholder_data'),
+ ('placeholder_data', 'unsqueeze_1', {'in': 0}),
+ ('unsqueeze_1_const', 'unsqueeze_1_const_data'),
+ ('unsqueeze_1_const_data', 'unsqueeze_1', {'in': 1}),
+ ('unsqueeze_1', 'unsqueeze_1_data'),
+ ('unsqueeze_1_data', 'tile'),
+ ('tile', 'tile_data'),
+ ('tile_data', 'result')
+ ],
+ {
+ 'placeholder_data': {'shape': int64_array([1, 1, 1])},
+ 'unsqueeze_1_const_data': {'value': int64_array([0])}
+ },
+ nodes_with_edges_only=True
+ )
+
+ MXTileReplacer().find_and_replace_pattern(graph)
+ graph.clean_up()
+
+ (flag, resp) = compare_graphs(graph, ref_graph, 'placeholder', check_op_attrs=True)
+ self.assertTrue(flag, resp)
+
+ def test_insert_two_unsqueezes(self):
+ graph = build_graph(
+ nodes_attributes,
+ [
+ ('placeholder', 'placeholder_data'),
+ ('placeholder_data', 'tile'),
+ ('tile', 'tile_data'),
+ ('tile_data', 'result')
+ ],
+ {
+ 'placeholder_data': {'shape': int64_array([1, 1])}
+ },
+ nodes_with_edges_only=True
+ )
+
+ ref_graph = build_graph(
+ nodes_attributes,
+ [
+ ('placeholder', 'placeholder_data'),
+ ('placeholder_data', 'unsqueeze_1', {'in': 0}),
+ ('unsqueeze_1_const', 'unsqueeze_1_const_data'),
+ ('unsqueeze_1_const_data', 'unsqueeze_1', {'in': 1}),
+ ('unsqueeze_1', 'unsqueeze_1_data'),
+ ('unsqueeze_1_data', 'tile'),
+ ('tile', 'tile_data'),
+ ('tile_data', 'result')
+ ],
+ {
+ 'placeholder_data': {'shape': int64_array([1, 1])},
+ 'unsqueeze_1_const_data': {'value': int64_array([0, 1])}
+ },
+ nodes_with_edges_only=True
+ )
+
+ MXTileReplacer().find_and_replace_pattern(graph)
+ graph.clean_up()
+
+ (flag, resp) = compare_graphs(graph, ref_graph, 'placeholder', check_op_attrs=True)
+ self.assertTrue(flag, resp)
import numpy as np
from extensions.middle.MinimumMiddleReplacer import MinimumMiddleReplacer
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
import logging as log
from typing import Dict
+
import numpy as np
from extensions.middle.BinarizeWeightsM1P1 import BinarizeWeightsM1P1
import numpy as np
from extensions.middle.ReluQuantizeFuse import ReluQuantizeFuse, ReluFakeQuantizeMark
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes = {
# input
import unittest
from extensions.middle.RemoveDuplicationMemory import RemoveMemoryDuplicationPattern, MergeNeighborSplicePattern
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class RemoveMemoryDuplicationPatternTests(unittest.TestCase):
from extensions.middle.RemoveUselessConcatSplit import RemoveUselessConcatSplitPattern
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class RemoveUselessConcatSplitTests(unittest.TestCase):
import unittest
from extensions.middle.RemoveUselessCrops import RemoveUselessCropsPattern
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class RemoveUselessCropsPatternTests(unittest.TestCase):
from extensions.middle.ReplaceMemoryOffsetWithSplice import ReplaceMemoryOffsetNodePattern
from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class ReplaceMemoryOffsetNodePatternTests(unittest.TestCase):
import unittest
from extensions.middle.ReplacePNorm import ReplacePNormNodePattern
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class ReplacePNormNodePatternTests(unittest.TestCase):
from extensions.middle.ReplaceSpliceNodePattern import ReplaceSpliceNodePattern
from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class ReplaceSpliceNodePatternTests(unittest.TestCase):
import numpy as np
from extensions.middle.ScaleInput import ScaleInput
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'},
'node_1_data': {'value': None, 'kind': 'data', 'data_type': None},
import numpy as np
from extensions.middle.SharedWeightsDuplication import SharedWeightsDuplication
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'const': {'shape': None, 'type': 'Const', 'kind': 'op', 'op': 'Const'},
import numpy as np
from mo.front.common.partial_infer.utils import int64_array
-from mo.graph.graph import Graph, Node, rename_node, rename_nodes
+from mo.graph.graph import Graph, Node, rename_nodes
from mo.middle.replacement import MiddleReplacementPattern
from mo.ops.const import Const
from mo.ops.crop import Crop
graph_condition = [lambda graph: graph.graph['is_cyclic']]
def run_after(self):
- from extensions.middle.TensorIteratorCondition import SimpleConditionMatcher
return [DynamicDecoderConditionMatcher]
def run_before(self):
import unittest
from extensions.middle.TensorIteratorBackEdge import BackEdgesMatching
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class BackEdgesMatchingTests(unittest.TestCase):
import numpy as np
from extensions.middle.TensorIteratorCondition import LoopConditionMatcher
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class TensorIteratorConditionTests(unittest.TestCase):
import numpy as np
from extensions.middle.TensorIteratorInput import SmartInputMatcher, SimpleInputMatcher, BackEdgeSimpleInputMatcher
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class SmartInputMatcherTests(unittest.TestCase):
limitations under the License.
"""
+from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize
from extensions.middle.TF_lstm_cell_to_generic import TensorFlowLSTMtoGeneric
from extensions.middle.TensorIteratorMerge import TensorIteratorMerge
from mo.graph.graph import Graph
from mo.middle.pattern_match import find_isomorphisms
from mo.middle.replacement import MiddleReplacementPattern
from mo.utils.error import Error
-from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize
class TensorIteratorLSTM(MiddleReplacementPattern):
"""
from collections import deque
+from copy import deepcopy
import numpy as np
-from copy import deepcopy
from extensions.ops.tensor_iterator import TensorIterator
from mo.graph.graph import Node, Graph, add_opoutput
import numpy as np
from extensions.middle.TensorIteratorOutput import SmartOutputMatcher
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class SmartOutputMatcherTests(unittest.TestCase):
from extensions.middle.quantize_fuses import FakeQuantizeFuse
from mo.front.common.partial_infer.eltwise import eltwise_infer
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes = {
'placeholder': {'kind': 'op', 'op': 'Placeholder'},
import numpy as np
-from extensions.ops.identity import IdentityOp
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.middle.passes.eliminate import merge_data_nodes
from mo.middle.replacement import MiddleReplacementPattern
from mo.utils.error import Error
from extensions.middle.sparse_reshape import SparseReshapeMiddleReplacer
from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class SparseReshapeMiddleReplacerTests(unittest.TestCase):
"""
import unittest
-import numpy as np
-
-from extensions.middle.UselessStridedSlice import UselessStridedSliceEraser
from extensions.middle.wights_permute_normalizer import WeightsPermuteNormalizer
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-from mo.utils.ir_engine.compare_graphs import compare_graphs
nodes_attributes = {
'placeholder': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'},
limitations under the License.
"""
-import networkx as nx
-
from mo.front.common.partial_infer.utils import mark_input_bins
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
-import numpy as np
class BlockLSTM(Op):
limitations under the License.
"""
-
-import networkx as nx
import numpy as np
+
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
-from mo.utils.error import Error
class Enter(Op):
"""
import numpy as np
+
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
-from mo.utils.error import Error
class Exit(Op):
See the License for the specific language governing permissions and
limitations under the License.
"""
+import numpy as np
+
from extensions.ops.RNN import rnn_infer
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
-import numpy as np
class GRU(Op):
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
graph_nodes_attrs = {
'A': {'type': 'Const', 'op': 'Const', 'kind': 'op', 'shape': None, 'value': None},
'A_data': {'kind': 'data', 'shape': None, 'value': None},
limitations under the License.
"""
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
limitations under the License.
"""
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
limitations under the License.
"""
-
-import networkx as nx
-
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
limitations under the License.
"""
+from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph, Node
from mo.ops.op import Op
from mo.ops.pooling import Pooling
-from mo.front.common.partial_infer.utils import int64_array
class AdaptiveAvgPooling(Op):
"""
import logging as log
+
import numpy as np
from mo.front.caffe.extractors.utils import get_canonical_axis_index
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
-from mo.utils.error import Error
class Assert(Op):
limitations under the License.
"""
-from mo.front.common.partial_infer.utils import int64_array
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.ops.op import Op
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
+
class Bucketize(Op):
op = 'Bucketize'
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
nodes_attributes = {'input_tensor': {'shape': None, 'value': None, 'kind': 'data'},
'input_buckets': {'shape': None, 'value': None, 'kind': 'data'},
'bucketize_node': {'op': 'Bucketize', 'kind': 'op', 'with_right_bound': False},
# axis - dimension number for tensors concatenation
import copy
-import networkx as nx
-
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
import numpy as np
+from mo.front.common.layout import shape_for_layout, get_height_dim, get_batch_dim, get_features_dim, get_width_dim
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
if in_shape.size != 4:
raise Error('TensorFlow DepthToSpace operation is supported for 4D \'NHWC\' input layout only. '
'Current input shape is \'{}\''.format(in_shape))
- N, H, W, C = in_shape
+
+ layout = node.graph.graph['layout']
+
+ N = in_shape[get_batch_dim(layout, 4)]
+ H = in_shape[get_height_dim(layout, 4)]
+ W = in_shape[get_width_dim(layout, 4)]
+ C = in_shape[get_features_dim(layout, 4)]
+
block_size = node['block_size']
if C % (block_size ** 2):
raise Error('Feature dimensions of input tensor of DepthToSpace operation have to be divisible by square '
'of DepthToSpace \'block_size\' parameter. Input tensor shape = {}. Feature dimension = {}. '
'block_size = {}'.format(in_shape, C, block_size))
- out_shape = [N, int(H * block_size), int(W * block_size), int(C / (block_size ** 2))]
- if np.prod(in_shape) != np.prod(out_shape):
- return
+
+ out_shape = shape_for_layout(layout,
+ batch=N,
+ features=int(C / (block_size ** 2)),
+ height=int(H * block_size),
+ width=int(W * block_size))
+
+ assert np.prod(in_shape) == np.prod(out_shape)
node.out_node().shape = int64_array(out_shape)
"""
import unittest
-
import numpy as np
-
from extensions.ops.depth_to_space import DepthToSpaceOp
from mo.graph.graph import Node
from mo.utils.error import Error
class TestDepthToSpacePartialInfer(unittest.TestCase):
- def test_tf_depth_to_space_infer(self):
+ def test_tf_depth_to_space_infer_nhwc(self):
graph = build_graph(nodes, edges)
+ graph.graph['layout'] = 'NHWC'
dts_node = Node(graph, 'DtS')
DepthToSpaceOp.infer(dts_node)
exp_shape = np.array([1, 2048, 1152, 64])
res_shape = graph.node['out_data_node']['shape']
self.assertTrue(np.array_equal(exp_shape, res_shape))
+ def test_tf_depth_to_space_infer_nchw(self):
+ graph = build_graph(nodes, edges)
+ graph.graph['layout'] = 'NCHW'
+ graph.node['in_data_node']['shape'] = np.array([1, 256, 1024, 576])
+ dts_node = Node(graph, 'DtS')
+ DepthToSpaceOp.infer(dts_node)
+ exp_shape = np.array([1, 64, 2048, 1152])
+ res_shape = graph.node['out_data_node']['shape']
+ self.assertTrue(np.array_equal(exp_shape, res_shape))
+
def test_tf_depth_to_space_infer_error(self):
graph = build_graph(nodes, edges)
+ graph.graph['layout'] = 'NHWC'
graph.node['in_data_node']['shape'] = np.array([1024, 576, 256])
dts_node = Node(graph, 'DtS')
self.assertRaises(Error, DepthToSpaceOp.infer, dts_node)
- def test_tf_depth_to_space_infer_error_1(self):
+ def test_tf_depth_to_space_infer_divisibility_error_1(self):
graph = build_graph(nodes, edges)
+ graph.graph['layout'] = 'NHWC'
graph.node['in_data_node']['shape'] = np.array([1, 1024, 576, 255])
dts_node = Node(graph, 'DtS')
self.assertRaises(Error, DepthToSpaceOp.infer, dts_node)
+
+ def test_tf_depth_to_space_infer_divisibility_error_2(self):
+ graph = build_graph(nodes, edges)
+ graph.graph['layout'] = 'NCHW'
+ graph.node['in_data_node']['shape'] = np.array([1, 255, 1024, 576])
+ dts_node = Node(graph, 'DtS')
+ self.assertRaises(Error, DepthToSpaceOp.infer, dts_node)
+
import numpy as np
from generator import generator, generate
-from extensions.ops.elementwise import Div, Elementwise
+from extensions.ops.elementwise import Div
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
graph_nodes_attrs = {
'A': {'type': 'Const', 'op': 'Const', 'kind': 'op', 'shape': None, 'value': None},
'A_data': {'kind': 'data', 'shape': None, 'value': None},
limitations under the License.
"""
-import logging as log
-import networkx as nx
import numpy as np
-from mo.front.caffe.extractors.utils import get_canonical_axis_index
from mo.graph.graph import Node, Graph
-from mo.ops.op import Op, PermuteAttrs
+from mo.ops.op import Op
class ExpOp(Op):
limitations under the License.
"""
-import networkx as nx
-
from mo.front.common.partial_infer.elemental import copy_shape_infer
from mo.graph.graph import Graph
from mo.ops.op import Op
limitations under the License.
"""
-import networkx as nx
-
from mo.graph.graph import Graph
from mo.ops.op import Op
import unittest
-import networkx as nx
-from mo.graph.graph import Graph
from extensions.ops.instance_normalization import InstanceNormalization
+from mo.graph.graph import Graph
class InstanceNormalizationOp(unittest.TestCase):
limitations under the License.
"""
-import networkx as nx
-
from mo.front.common.partial_infer.utils import mark_input_bins
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
"""
import unittest
+
import numpy as np
from extensions.ops.merge import Merge
from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class TestMerge(unittest.TestCase):
limitations under the License.
"""
-import numpy as np
-
-from mo.front.caffe.extractors.utils import get_canonical_axis_index
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.ops.op import Op
See the License for the specific language governing permissions and
limitations under the License.
"""
-import numpy as np
import logging as log
+import numpy as np
+
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
"""
- Copyright (C) 2017-2020 Intel Corporation
+ Copyright (C) 2018-2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node, Graph
+from mo.middle.passes.convert_data_type import np_data_type_to_destination_type
from mo.ops.op import Op
class NonZero(Op):
op = 'NonZero'
+ enabled = False
def __init__(self, graph: Graph, attrs: dict):
+ assert 'output_type' in attrs, 'NonZero has mandatory `output_type` attribute'
mandatory_props = {
- 'type': None,
- 'op': __class__.op,
- 'infer': NonZero.infer,
+ 'op': self.op,
+ 'type': self.op,
+ 'version': 'opset3',
+
+ 'infer': self.infer,
+ 'type_infer': self.type_infer,
+
'in_ports_count': 1,
'out_ports_count': 1,
}
super().__init__(graph, mandatory_props, attrs)
+ def backend_attrs(self):
+ return [
+ ('output_type', lambda node: np_data_type_to_destination_type(node.output_type)),
+ ]
+
@staticmethod
def infer(node: Node):
- input_shape = node.in_node(0).shape
- if input_shape is None:
- return
- input_value = node.in_node(0).value
+ node_name = node.soft_get('name', node.id)
+ input_shape = node.in_port(0).data.get_shape()
+ assert input_shape is not None, 'The input shape for node "{}" is None'.format(node_name)
+ assert node.has_valid('output_type'), \
+ '`output_type` attribute is not set for NonZero node `{}`'.format(node_name)
+ assert node.output_type in [np.int64, np.int32], \
+ 'NonZero `output_type` attribute must be int32 or int64, `{}` found'.format(np.dtype(node.output_type).name)
+
+ input_value = node.in_port(0).data.get_value()
if input_value is not None:
- node.out_port(0).data.set_value(np.array(np.nonzero(input_value)))
+ node.out_port(0).data.set_value(np.array(np.nonzero(input_value), dtype=node.output_type))
else:
- node.out_port(0).data.set_shape(int64_array([len(input_shape), *input_shape]))
+ # output shape of NonZero should be [input_rank, dynamic]
+ # having restriction to save IR with static shape only we count upper-bound shape value here
+ node.out_port(0).data.set_shape(int64_array([len(input_shape), np.prod(input_shape)]))
+ @staticmethod
+ def type_infer(node):
+ assert node.output_type in [np.int64, np.int32], \
+ 'NonZero `output_type` attribute must be int32 or int64, `{}` found'.format(np.dtype(node.output_type).name)
+ node.out_port(0).set_data_type(node.output_type)
limitations under the License.
"""
-from mo.front.common.partial_infer.utils import mark_input_bins
from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.front.common.partial_infer.utils import mark_input_bins
from mo.graph.graph import Graph, Node
from mo.ops.op import Op
from mo.utils.utils import convert_param_type
limitations under the License.
"""
-import numpy as np
-import networkx as nx
-
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
from mo.ops.op import Op
limitations under the License.
"""
-import networkx as nx
-
from mo.front.common.partial_infer.elemental import copy_shape_infer
from mo.graph.graph import Graph
from mo.ops.op import Op
limitations under the License.
"""
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
import numpy as np
from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.front.common.partial_infer.utils import mark_input_bins
from mo.graph.graph import Graph
from mo.ops.op import Op
-from mo.front.common.partial_infer.utils import mark_input_bins
class PreluOp(Op):
limitations under the License.
"""
-import networkx as nx
-
from extensions.ops.proposal import ProposalOp
from mo.front.caffe.extractor import register_caffe_python_extractor
from mo.graph.graph import Graph
limitations under the License.
"""
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.front.common.layout import get_features_dim, shape_for_layout
+from mo.graph.graph import Graph
+from mo.ops.op import Op
+
+
+class ROIAlign(Op):
+    """ROIAlign-3 operation: pools feature-map regions given by ROI boxes to a fixed spatial size."""
+    op = 'ROIAlign'
+    enabled = False
+
+    def __init__(self, graph: Graph, attrs: dict):
+        # All opset3 ROIAlign attributes are mandatory; fail early if the caller omitted one.
+        assert 'mode' in attrs, '`mode` attribute is not set for ROIAlign during creation'
+        assert 'pooled_h' in attrs, '`pooled_h` attribute is not set for ROIAlign during creation'
+        assert 'pooled_w' in attrs, '`pooled_w` attribute is not set for ROIAlign during creation'
+        assert 'sampling_ratio' in attrs, '`sampling_ratio` attribute is not set for ROIAlign during creation'
+        assert 'spatial_scale' in attrs, '`spatial_scale` attribute is not set for ROIAlign during creation'
+
+        super().__init__(graph, {
+            'op': self.op,
+            'type': self.op,
+            'version': 'opset3',
+
+            'infer': self.infer,
+
+            'in_ports_count': 3,
+            'out_ports_count': 1,
+        }, attrs)
+
+    def backend_attrs(self):
+        # Serialize attributes to IR with explicit types: ints for the pooled dims and
+        # sampling ratio, float for the spatial scale, plain string for the mode.
+        return [
+            ('mode', lambda node: str(node.mode)),
+            ('pooled_h', lambda node: str(int(node.pooled_h))),
+            ('pooled_w', lambda node: str(int(node.pooled_w))),
+            ('sampling_ratio', lambda node: str(int(node.sampling_ratio))),
+            ('spatial_scale', lambda node: str(float(node.spatial_scale))),
+        ]
+
+    @staticmethod
+    def infer(node):
+        # Shape inference: validates the three inputs (feature map, ROIs, batch indices)
+        # and produces [num_rois, C, pooled_h, pooled_w] arranged per the graph layout.
+        layout = node.graph.graph['layout']
+        node_name = node.soft_get('name', node.id)
+
+        assert len([port for port in node.in_ports().values() if not port.disconnected()]) == 3, \
+            'The node "{}" must have 3 inputs'.format(node_name)
+
+        assert node.has_valid('pooled_w'), '"pooled_w" attribute is not set for node "{}"'.format(node_name)
+        assert node.has_valid('pooled_h'), '"pooled_h" attribute is not set for node "{}"'.format(node_name)
+        assert node.has_valid('mode'), '"mode" attribute is not set for node "{}"'.format(node_name)
+        assert node.mode in ['avg', 'max'], \
+            '"mode" attribute range of values is ["avg", "max"], got {} for node "{}"'.format(node.mode, node_name)
+
+        input_shape = node.in_port(0).data.get_shape()
+        rois_shape = node.in_port(1).data.get_shape()
+        indices_shape = node.in_port(2).data.get_shape()
+        assert input_shape is not None and rois_shape is not None and indices_shape is not None, \
+            'The node "{}" input shape is None'.format(node_name)
+        assert rois_shape[0] == indices_shape[0], 'The number of batch indices does not correspond to number of ROIs ' \
+                                                 'for node "{}"'.format(node_name)
+        assert rois_shape[1] == 4, 'The size of ROI element must be 4 for node "{}"'.format(node_name)
+        assert len(input_shape) == 4, 'The rank of port 0 input tensor of node "{}" must be 4.'.format(node_name)
+
+        # Output "batch" equals the number of ROIs; channels come from the feature-map
+        # input; spatial dims are the requested pooled size.
+        node.out_port(0).data.set_shape(
+            shape_for_layout(layout,
+                             batch=rois_shape[0],
+                             features=input_shape[get_features_dim(layout, 4)],
+                             height=node.pooled_h,
+                             width=node.pooled_w)
+        )
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.graph.graph import Node, Graph
+from mo.ops.op import Op
+
+
+class Scatter(Op):
+    """Base class for all Scatter* operations.
+
+    Concrete subclasses must override `op`, `op_type` and `version`; the
+    constructor rejects direct instantiation of this base class.
+    """
+    enabled = False
+
+    # None here marks the class as abstract; subclasses supply real values.
+    op = op_type = None
+    version = None
+
+    def __init__(self, graph: Graph, attrs: dict):
+        assert self.op is not None and self.op_type is not None and self.version is not None, \
+            'Please use specialized Scatter operation class, Scatter is base class'
+
+        mandatory_props = {
+            'op': self.op,
+            'type': self.op_type,
+            'version': self.version,
+
+            'is_scatter': True,  # is used for gathering all types of scatters in common transformations
+            'infer': self.infer,
+
+            'in_ports_count': 4,
+            'out_ports_count': 1,
+        }
+        super().__init__(graph, mandatory_props, attrs)
+
+    @staticmethod
+    def infer(node: Node):
+        # Shape inference only: the output tensor always has the shape of the
+        # data input (port 0); indices/updates shapes are merely checked for presence.
+        node_name = node.soft_get('name', node.id)
+
+        input_shape = node.in_port(0).data.get_shape()
+        indices_shape = node.in_port(1).data.get_shape()
+        updates_shape = node.in_port(2).data.get_shape()
+        assert input_shape is not None and updates_shape is not None and indices_shape is not None, \
+            'The node "{}" input shape is None'.format(node_name)
+
+        node.out_port(0).data.set_shape(input_shape)
+
+
+# Subclasses with `op_type`/`version` left as None are internal-only operations:
+# they cannot be emitted to IR and must be transformed away before serialization.
+class ScatterElementsAdd(Scatter):
+    op = 'ScatterElementsAdd'
+    op_type = None
+    version = None
+
+
+class ScatterElementsDiv(Scatter):
+    op = 'ScatterElementsDiv'
+    op_type = None
+    version = None
+
+
+class ScatterElementsMax(Scatter):
+    op = 'ScatterElementsMax'
+    op_type = None
+    version = None
+
+
+class ScatterElementsMin(Scatter):
+    op = 'ScatterElementsMin'
+    op_type = None
+    version = None
+
+
+class ScatterElementsMul(Scatter):
+    op = 'ScatterElementsMul'
+    op_type = None
+    version = 'opset3'
+
+
+class ScatterElementsSub(Scatter):
+    op = 'ScatterElementsSub'
+    op_type = None
+    version = None
+
+
+class ScatterElementsUpdate(Scatter):
+    op = op_type = 'ScatterElementsUpdate'
+    version = 'opset3'
+
+
+class ScatterAdd(Scatter):
+    op = 'ScatterAdd'
+    op_type = None
+    version = None
+
+
+class ScatterDiv(Scatter):
+    op = 'ScatterDiv'
+    op_type = None
+    version = None
+
+
+class ScatterMax(Scatter):
+    op = 'ScatterMax'
+    op_type = None
+    version = None
+
+
+class ScatterMin(Scatter):
+    op = 'ScatterMin'
+    op_type = None
+    version = None
+
+
+class ScatterMul(Scatter):
+    op = 'ScatterMul'
+    op_type = None
+    version = None
+
+
+class ScatterSub(Scatter):
+    op = 'ScatterSub'
+    op_type = None
+    version = None
+
+
+class ScatterUpdate(Scatter):
+    op = op_type = 'ScatterUpdate'
+    version = 'opset3'
"""
import unittest
+
import numpy as np
from extensions.ops.select import Select
from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
class TestSelect(unittest.TestCase):
import logging as log
-import networkx as nx
import numpy as np
from mo.front.extractor import attr_getter
limitations under the License.
"""
-import logging as log
-
import numpy as np
+from mo.front.common.layout import shape_for_layout, get_height_dim, get_batch_dim, get_features_dim, get_width_dim
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
+from mo.utils.error import Error
class SpaceToDepth(Op):
def infer(node: Node):
in_shape = node.in_node().shape
if in_shape.size != 4:
- log.error('TensorFlow SpaceToDepth operation is supported for 4D \'NHWC\' input layout only. '
- 'Current input shape is \'{}\''.format(in_shape))
- return
- N, H, W, C = in_shape
+ raise Error('TensorFlow SpaceToDepth operation is supported for 4D \'NHWC\' input layout only. '
+ 'Current input shape is \'{}\''.format(in_shape))
+
+ layout = node.graph.graph['layout']
+ N = in_shape[get_batch_dim(layout, 4)]
+ H = in_shape[get_height_dim(layout, 4)]
+ W = in_shape[get_width_dim(layout, 4)]
+ C = in_shape[get_features_dim(layout, 4)]
+
block_size = node['block_size']
if H % block_size or W % block_size:
- log.error('Spatial dimensions of input tensor of SpaceToDepth operation have to be divisible by '
- 'SpaceToDepth \'block_size\' parameter. Input tensor shape = {}. Spatial dimensions = {},{}. '
- 'block_size = {}'.format(in_shape, H, W, block_size))
- return
- out_shape = [N, int(H / block_size), int(W / block_size), int(C * (block_size ** 2))]
+ raise Error('Spatial dimensions of input tensor of SpaceToDepth operation have to be divisible by '
+ 'SpaceToDepth \'block_size\' parameter. Input tensor shape = {}. Spatial dimensions = {},{}. '
+ 'block_size = {}'.format(in_shape, H, W, block_size))
+
+ out_shape = shape_for_layout(layout,
+ batch=N,
+ features=int(C * (block_size ** 2)),
+ height=int(H / block_size),
+ width=int(W / block_size))
+
assert np.prod(in_shape) == np.prod(out_shape)
node.out_node().shape = int64_array(out_shape)
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+import numpy as np
+from extensions.ops.space_to_depth import SpaceToDepth
+from mo.graph.graph import Node
+from mo.utils.error import Error
+from mo.utils.unittest.graph import build_graph
+
+# Shared graph template: a single SpaceToDepth op (block_size=2) between one
+# input data node and one output data node; tests override shapes/layout per case.
+nodes = {
+    'in_data_node': {'value': None, 'kind': 'data', 'shape': np.array([1, 2048, 1152, 64])},
+    'StD': {'op': 'SpaceToDepth', 'kind': 'op', 'block_size': 2},
+    'out_data_node': {'value': None, 'kind': 'data', 'shape': None}
+}
+
+edges = [
+    ('in_data_node', 'StD'),
+    ('StD', 'out_data_node')
+]
+
+class TestSpaceToDepthPartialInfer(unittest.TestCase):
+    # Shape inference tests for SpaceToDepth: spatial dims shrink by block_size,
+    # channels grow by block_size**2; axis order depends on the graph layout.
+    def test_tf_space_to_depth_infer_nhwc(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
+        std_node = Node(graph, 'StD')
+        SpaceToDepth.infer(std_node)
+        exp_shape = np.array([1, 1024, 576, 256])
+        res_shape = graph.node['out_data_node']['shape']
+        self.assertTrue(np.array_equal(exp_shape, res_shape))
+
+    def test_tf_space_to_depth_infer_nchw(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NCHW'
+        graph.node['in_data_node']['shape'] = np.array([1, 64, 2048, 1152])
+        std_node = Node(graph, 'StD')
+        SpaceToDepth.infer(std_node)
+        exp_shape = np.array([1, 256, 1024, 576])
+        res_shape = graph.node['out_data_node']['shape']
+        self.assertTrue(np.array_equal(exp_shape, res_shape))
+
+    def test_tf_space_to_depth_infer_shape_error(self):
+        # A non-4D input must raise (SpaceToDepth supports 4D inputs only).
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
+        graph.node['in_data_node']['shape'] = np.array([1024, 576, 256])
+        std_node = Node(graph, 'StD')
+        self.assertRaises(Error, SpaceToDepth.infer, std_node)
+
+    def test_tf_space_to_depth_infer_divisibility_error_1(self):
+        # Spatial dims not divisible by block_size must raise (NHWC case).
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
+        graph.node['in_data_node']['shape'] = np.array([1, 1024, 577, 256])
+        std_node = Node(graph, 'StD')
+        self.assertRaises(Error, SpaceToDepth.infer, std_node)
+
+    def test_tf_space_to_depth_infer_divisibility_error_2(self):
+        # Spatial dims not divisible by block_size must raise (NCHW case).
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NCHW'
+        graph.node['in_data_node']['shape'] = np.array([1, 256, 1024, 577])
+        std_node = Node(graph, 'StD')
+        self.assertRaises(Error, SpaceToDepth.infer, std_node)
\ No newline at end of file
import logging as log
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
nodes_attributes = {'input_indices': {'shape': None, 'value': None, 'kind': 'data'},
'input_values': {'shape': None, 'value': None, 'kind': 'data'},
'dense_shape': {'shape': None, 'value': None, 'kind': 'data'},
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
nodes_attributes = {'input_indices': {'shape': None, 'value': None, 'kind': 'data'},
'input_shape': {'shape': None, 'value': None, 'kind': 'data'},
'new_shape': {'shape': None, 'value': None, 'kind': 'data'},
limitations under the License.
"""
-import logging as log
-
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
limitations under the License.
"""
-import logging as log
-
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
limitations under the License.
"""
-import logging as log
-
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
# graph 1
nodes_attributes = {
'input_indices': {'kind': 'op', 'op': 'Parameter', 'shape': int64_array([5, 2])},
limitations under the License.
"""
-import numpy as np
-
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
nodes_attributes = {'input_indices': {'shape': None, 'value': None, 'kind': 'data'},
'input_values': {'shape': None, 'value': None, 'kind': 'data'},
'input_dense_shape': {'shape': None, 'value': None, 'kind': 'data'},
"""
import logging as log
+
import numpy as np
from mo.front.common.partial_infer.utils import int64_array
import numpy as np
+from extensions.ops.split import AttributedSplit, AttributedVariadicSplit
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node
-from extensions.ops.split import AttributedSplit, AttributedVariadicSplit
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class TestSplitOp(unittest.TestCase):
from mo.front.common.partial_infer.elemental import copy_shape_infer
from mo.graph.graph import Graph
from mo.ops.op import Op
-from mo.front.common.partial_infer.utils import mark_input_bins
class StopGradientOp(Op):
from extensions.ops.switch import Switch
from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph_with_edge_attrs, build_graph_with_attrs
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_edge_attrs, build_graph_with_attrs
class TestSwitch(unittest.TestCase):
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
input_shape = np.array([1, 3, 224, 224])
limitations under the License.
"""
-import logging as log
-
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
from mo.graph.graph import Node
from mo.utils.unittest.graph import build_graph
-
# graph 1 with two outputs: uniques and indices
nodes_attributes = {'input': {'shape': None, 'value': None, 'kind': 'data'},
'unique_node': {'op': 'Unique', 'kind': 'op'},
"""
import hashlib
-from defusedxml.minidom import parseString
from xml.etree.ElementTree import Element, SubElement, tostring
+from defusedxml.minidom import parseString
+
from mo.graph.graph import *
-from mo.middle.passes.convert_data_type import data_type_str_to_precision, np_data_type_to_precision
+from mo.middle.passes.convert_data_type import np_data_type_to_precision
from mo.utils.unsupported_ops import UnsupportedOps
from mo.utils.utils import refer_to_faq_msg
from mo.utils.version import get_version
"""
import logging as log
from builtins import AttributeError
+
from defusedxml import ElementTree
from mo.front.caffe.collect_attributes import collect_attributes
limitations under the License.
"""
-import logging as log
-
import numpy as np
from mo.utils.error import Error
import logging as log
import os
-import networkx as nx
import numpy as np
from mo.graph.graph import Node, Graph
# Concat infer : N - number of inputs to concat
# axis - dimension number for tensors concatenation
import numpy as np
-from mo.front.common.partial_infer.utils import int64_array
from mo.front.caffe.extractors.utils import get_canonical_axis_index
+from mo.front.common.partial_infer.utils import int64_array
from mo.ops.op import PermuteAttrs
"""
import logging as log
+
import numpy as np
-from mo.front.common.layout import get_batch_dim, get_features_dim, get_height_dim, get_width_dim, shape_for_layout
+from mo.front.common.layout import get_batch_dim, get_features_dim, shape_for_layout
from mo.graph.graph import Node
"""
import logging as log
+from typing import Iterable
import numpy as np
-from typing import Iterable
-
def int64_array(l: Iterable):
return np.array(l, dtype=np.int64)
def convert_tf_padding_to_str(padding):
- mapping = {b'SAME': 'same_upper', b'VALID': 'valid'}
- return mapping[padding.s]
+ mapping = {'SAME': 'same_upper', 'VALID': 'valid'}
+ return mapping[padding]
+
+
+def convert_deconv_tf_padding_to_str(padding):
+    # Map a decoded TensorFlow padding string to the IR "auto_pad" value for
+    # transposed convolutions. Raises KeyError for an unknown padding mode.
+    # according to the formulas for calculating "auto_pad" values of the
+    # ConvBackpropData layer in the Operation Specification,
+    # the "same_lower" value matches to the "same" value for conv_transpose layer in TensorFlow
+    mapping = {'SAME': 'same_lower', 'VALID': 'valid'}
+    return mapping[padding]
# TODO eliminate this dependency and pass necessary function as an argument
normalized_stride = 1 / stride
if auto_pad in ['same_lower', 'same_upper']:
- if auto_pad == 'same_upper':
- output = np.int64(np.ceil(input / normalized_stride))
- else:
- output = np.int64(np.floor(input / normalized_stride))
+ output = np.int64(np.ceil(input / normalized_stride))
residual = input % stride
mask = residual == 0
full_pad = window.copy()
"""
import logging as log
-import networkx as nx
-
from mo.front.subgraph_matcher import SubgraphMatch
from mo.graph.graph import Node, merge_edge_props, Graph
from mo.middle.pattern_match import apply_pattern
add_output_ops
from mo.graph.graph import Node
from mo.utils.error import Error
+from mo.utils.ir_engine.compare_graphs import compare_graphs
from mo.utils.unittest.extractors import FakeMultiParam
from mo.utils.unittest.graph import build_graph, build_graph_with_edge_attrs, build_graph_with_attrs
-from mo.utils.ir_engine.compare_graphs import compare_graphs
class FakePythonParam:
See the License for the specific language governing permissions and
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.elementwise import Add
+from mo.front.extractor import FrontExtractorOp
class AddFrontExtractor(FrontExtractorOp):
from mo.front.caffe.extractors.utils import embed_input
from mo.front.extractor import FrontExtractorOp
-from mo.front.kaldi.loader.utils import read_binary_bool_token, read_binary_integer32_token, collect_until_token, read_binary_float_token
+from mo.front.kaldi.loader.utils import read_binary_bool_token, read_binary_integer32_token, collect_until_token, \
+ read_binary_float_token
from mo.front.kaldi.utils import read_binary_vector
from mo.ops.scale_shift import ScaleShiftOp
from mo.utils.error import Error
See the License for the specific language governing permissions and
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.identity import IdentityOp
+from mo.front.extractor import FrontExtractorOp
class ClipGradientComponentFrontExtractor(FrontExtractorOp):
import numpy as np
-from extensions.ops.transpose import Transpose
from extensions.ops.gather import Gather
+from extensions.ops.transpose import Transpose
from mo.front.common.partial_infer.utils import int64_array
from mo.front.common.replacement import FrontReplacementOp
from mo.front.kaldi.loader.utils import read_binary_integer32_token, read_blob
limitations under the License.
"""
from mo.front.extractor import FrontExtractorOp
-from mo.ops.eltwise_ninputs_in_1 import EltwiseNin1
from mo.front.kaldi.utils import read_token_value
+from mo.ops.eltwise_ninputs_in_1 import EltwiseNin1
class ElementwiseProductComponentFrontExtractor(FrontExtractorOp):
limitations under the License.
"""
+from extensions.ops.MatMul import FullyConnected
from mo.front.caffe.extractors.utils import embed_input
from mo.front.extractor import FrontExtractorOp
from mo.front.kaldi.loader.utils import collect_until_token
from mo.front.kaldi.utils import read_binary_matrix
-from extensions.ops.MatMul import FullyConnected
class LinearComponentFrontExtractor(FrontExtractorOp):
mapping_rule = {}
- embed_input(mapping_rule, 1, 'i_weights', ifo_x_weights[0:1024])
- embed_input(mapping_rule, 2, 'f_weights', ifo_x_weights[1024:2048])
- embed_input(mapping_rule, 3, 'o_weights', ifo_x_weights[2048:])
+ assert len(ifo_x_weights_shape) == 2, "Unexpected shape of weights in LSTMNonLinearityComponent"
+ assert ifo_x_weights_shape[0] == 3, "Unexpected shape of weights in LSTMNonLinearityComponent"
+
+ ifo_x_weights = ifo_x_weights.reshape(ifo_x_weights_shape)
+ embed_input(mapping_rule, 1, 'i_weights', ifo_x_weights[0][:])
+ embed_input(mapping_rule, 2, 'f_weights', ifo_x_weights[1][:])
+ embed_input(mapping_rule, 3, 'o_weights', ifo_x_weights[2][:])
LstmNonLinearity.update_node_stat(node, mapping_rule)
return cls.enabled
from mo.front.common.extractors.utils import layout_attrs
from mo.front.extractor import FrontExtractorOp
-from mo.front.kaldi.loader.utils import read_token_value, collect_until_whitespace, collect_until_token, \
+from mo.front.kaldi.loader.utils import read_token_value, collect_until_token, \
read_binary_integer32_token, find_next_tag, read_placeholder
from mo.ops.pooling import Pooling
from mo.utils.error import Error
import numpy as np
+from extensions.ops.normalize import NormalizeOp
from mo.front.caffe.extractors.utils import embed_input
from mo.front.extractor import FrontExtractorOp
from mo.front.kaldi.loader.utils import collect_until_token, read_binary_bool_token, read_binary_integer32_token, \
- read_binary_float_token
-from extensions.ops.normalize import NormalizeOp
+ read_binary_float_token
from mo.utils.error import Error
import numpy as np
from extensions.ops.normalize import NormalizeOp
-from mo.front.kaldi.extractors.normalize_component_ext import NormalizeComponentFrontExtractor
from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
+from mo.front.kaldi.extractors.normalize_component_ext import NormalizeComponentFrontExtractor
from mo.front.kaldi.loader.utils_test import TestKaldiUtilsLoading
from mo.ops.op import Op
limitations under the License.
"""
+from extensions.ops.pnorm import PNormOp
from mo.front.extractor import FrontExtractorOp
from mo.front.kaldi.loader.utils import collect_until_token, read_binary_integer32_token, read_binary_float_token
-from extensions.ops.pnorm import PNormOp
from mo.utils.error import Error
import numpy as np
from extensions.ops.pnorm import PNormOp
-from mo.front.kaldi.extractors.pnorm_component_ext import PNormComponentFrontExtractor
from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
+from mo.front.kaldi.extractors.pnorm_component_ext import PNormComponentFrontExtractor
from mo.front.kaldi.loader.utils_test import TestKaldiUtilsLoading
from mo.ops.op import Op
limitations under the License.
"""
-from mo.front.extractor import FrontExtractorOp
from extensions.ops.activation_ops import ReLU
+from mo.front.extractor import FrontExtractorOp
class RectifiedLinearComponentFrontExtractor(FrontExtractorOp):
limitations under the License.
"""
import io
-
-import numpy as np
+import logging as log
import struct
from io import IOBase
import networkx as nx
-import logging as log
+import numpy as np
from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder, find_next_component, get_name_from_path, \
find_end_of_component, end_of_nnet_tag, read_binary_integer32_token, get_parameters, read_token_value, \
limitations under the License.
"""
import io
-import numpy as np
import struct
import unittest
+import numpy as np
+
from mo.front.kaldi.loader.loader import load_topology_map, load_components
from mo.graph.graph import Graph, Node
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
class TestKaldiModelsLoading(unittest.TestCase):
limitations under the License.
"""
import io
-
-import numpy as np
import os
import struct
+import numpy as np
+
from mo.utils.error import Error
from mo.utils.utils import refer_to_faq_msg
limitations under the License.
"""
import io
-import numpy as np
-import os
import logging as log
+import os
+
+import numpy as np
-from mo.front.kaldi.loader.utils import read_placeholder, read_binary_integer32_token, read_blob, read_token_value, find_next_tag
+from mo.front.kaldi.loader.utils import read_placeholder, read_binary_integer32_token, read_blob, read_token_value, \
+ find_next_tag
from mo.utils.error import Error
"""
from mo.front.common.partial_infer.multi_box_prior import multi_box_prior_infer_mxnet
-from mo.utils.error import Error
def multi_box_prior_ext(attr):
import mxnet as mx
+from extensions.ops.elementwise import Elementwise
from mo.graph.graph import Node, Graph
from mo.ops.const import Const
-from extensions.ops.elementwise import Elementwise
from mo.utils.error import Error
from mo.utils.str_to import StrTo
from mo.utils.utils import refer_to_faq_msg
limitations under the License.
"""
-import os
import json
+import logging as log
+import os
-import numpy as np
import mxnet as mx
-import logging as log
+import numpy as np
-from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states
from mo.front.mxnet.extractor import common_mxnet_fields
+from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states
from mo.front.mxnet.nd_to_params import build_params_file
from mo.graph.graph import Node, Graph
from mo.utils.error import Error
import os
import mxnet as mx
+
from mo.front.mxnet.extractors.utils import load_params
import logging as log
-import networkx as nx
import onnx
from mo.graph.graph import fill_graph_with_nodes, Graph
limitations under the License.
"""
-import numpy as np
import unittest
+import numpy as np
+
from mo.front.tf.extractors.utils import collect_tf_attrs, tf_tensor_content
from mo.utils.unittest.extractors import PB
"""
import collections
import logging as log
-from typing import List
-
from copy import deepcopy
+from typing import List
import networkx as nx
import numpy as np
import unittest
import numpy as np
-
from generator import generator, generate
from mo.graph.graph import Node, Graph, add_opoutput, dict_includes_compare_attrs
from mo.ops.const import Const
from mo.utils.error import Error
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes = {
'0': {'name': 'input1', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'Parameter'},
from collections import OrderedDict
import numpy as np
-from mo.pipeline.unified import unified_pipeline
from extensions.back.SpecialNodesFinalization import RemoveConstOps, CreateConstNodesReplacement, RemoveOutputOps, \
NormalizeTI
from mo.graph.graph import Graph
from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively, for_each_sub_graph_recursively
from mo.pipeline.common import prepare_emit_ir, get_ir_version
+from mo.pipeline.unified import unified_pipeline
from mo.utils import import_extensions
from mo.utils.cli_parser import get_placeholder_shapes, get_tuple_values, get_model_name, \
get_common_cli_options, get_caffe_cli_options, get_tf_cli_options, get_mxnet_cli_options, get_kaldi_cli_options, \
from mo.graph.graph import Node
from mo.middle.passes.conv import convert_muladd_to_scaleshift, convert_add_or_mul_to_scaleshift
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
See the License for the specific language governing permissions and
limitations under the License.
"""
-import re
import logging as log
+import re
from collections import deque
import networkx as nx
import numpy as np
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Node
from mo.middle.passes.eliminate import mark_output_reachable_nodes, mark_const_producer_nodes
from mo.utils.unittest.graph import build_graph
import numpy as np
from mo.middle.passes.fusing.decomposition import convert_scale_shift_to_mul_add, convert_batch_norm
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
from mo.front.common.partial_infer.eltwise import eltwise_infer
from mo.graph.graph import Node
from mo.middle.passes.fusing.fuse_linear_ops import _fuse_mul, _fuse_add, fuse_linear_ops
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
import numpy as np
-from mo.ops.const import Const
from extensions.ops.elementwise import Mul, Add
from mo.graph.graph import Node, Graph
from mo.middle.passes.fusing.helpers import get_value_in_port, \
get_tensor_in_port
+from mo.ops.const import Const
def _fuse_linear_sequence(graph: Graph, start_node: Node):
import numpy as np
from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {
'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
from mo.middle.passes.fusing.resnet_optimization import stride_optimization
from mo.ops.convolution import Convolution
from mo.ops.pooling import Pooling
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
max_elt_lambda = lambda node: eltwise_infer(node, lambda a, b: np.maximum(a, b))
"""
import logging as log
+
import networkx as nx
import numpy as np
import logging as log
-import numpy as np
-
from extensions.ops.activation_ops import LeakyReLU
from mo.graph.graph import Graph
from mo.middle.pattern_match import apply_pattern
import numpy as np
from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'},
'node_2': {'type': 'Identity', 'value': None, 'kind': 'op'},
"""
-from defusedxml.minidom import parseString
from xml.etree.ElementTree import Element, SubElement, tostring
+from defusedxml.minidom import parseString
+
from mo.graph.graph import Node, Graph
def backend_attrs(self):
if self.ir_version == 10:
+ def pad_attribute_helper(node: Node, pad_type: str='begin'):
+ assert pad_type in ['begin', 'end']
+ if not node.has_valid('pad'):
+ return None
+ pad = get_backend_pad(node.pad, node.spatial_dims, 0 if pad_type == 'begin' else 1)
+ if node.has_valid('auto_pad'):
+ pad = [0 for _ in pad]
+ return ','.join(map(str, pad))
+
return [
'auto_pad',
('strides', lambda node: ','.join(map(str, node['stride'][node.spatial_dims]))),
('dilations', lambda node: ','.join(map(str, node['dilation'][node.spatial_dims]))),
- ('pads_begin', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 0))) if node.has_valid('pad') else None),
- ('pads_end', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 1))) if node.has_valid('pad') else None),
+ ('pads_begin', lambda node: pad_attribute_helper(node, 'begin')),
+ ('pads_end', lambda node: pad_attribute_helper(node, 'end')),
('output_padding', lambda node: ','.join(map(str, node.output_padding[node.spatial_dims])) \
if node.has_valid('output_padding') else None),
limitations under the License.
"""
-import numpy as np
import logging as log
+import numpy as np
+
from mo.graph.graph import Graph
from mo.ops.op import Op
class Softmax(Op):
op = 'SoftMax'
- enabled = True
+ enabled = False
def __init__(self, graph: Graph, attrs: dict):
super().__init__(graph, {
copy_shape_infer(node)
PermuteAttrs.create_permute_attrs(node, attrs=[('axis', 'input:0')])
+
+class LogSoftmax(Op):
+ op = 'LogSoftmax'
+ enabled = False
+
+ def __init__(self, graph: Graph, attrs: dict):
+ super().__init__(graph, {
+ 'infer': None,
+ 'kind': 'op',
+ 'axis': 1,
+ 'type': None, # the operation will be replaced with a Log(Softmax(x)) sub-graph
+ 'op': __class__.op,
+ 'in_ports_count': 1,
+ 'out_ports_count': 1,
+ }, attrs)
limitations under the License.
"""
+import numpy as np
+
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.perm_inputs import PermuteInputs
from mo.ops.op import Op
-import numpy as np
class SpaceToBatch(Op):
import numpy as np
-from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node
-from mo.ops.op import PermuteAttrs
from mo.ops.squeeze import Squeeze
from mo.utils.unittest.graph import build_graph
if not node.has_valid(attr):
return None
- node[attr] = permute_array_with_ellipsis(node, node[attr], attr in ['begin_mask', 'end_mask'])
+ node[attr] = permute_array_with_ellipsis(node, node[attr], 0)
return node[attr]
('begin_mask', 'input:0', permute_masks),
('end_mask', 'input:0', permute_masks),
])
- for i in range(1, len(node.in_nodes())):
- if node.in_node(i).value is not None and len(node.in_node(0).shape) > 3:
- node.in_node(i).value = permute_array_with_ellipsis(node, node.in_node(i).value, 0)
+ # extend inputs according to ellipsis mask
+ in_shape = node.in_port(0).get_source().data.get_shape()
+ assert in_shape is not None, \
+ 'Input shape is unknown for 0 input of node {}'.format(node.name)
+ input_rank = len(in_shape)
+ if input_rank > 3:
+ for i_port in node.in_ports().values():
+ if i_port.idx == 0 or i_port.disconnected():
+ continue
+ old_value = i_port.data.get_value()
+ # additional check for non-const input
+ # error will be return in shape inference if non-const will be added
+ # it is paranoid check for case if shape inference will be changed
+ assert old_value is not None, \
+ '{} input of {} node is not constant: \'value\' attribute for edge ' + \
+ 'contains None'.format(i_port.idx, node.name)
+ # insert 0 for begin and end and 1 for stride
+ new_value = permute_array_with_ellipsis(node, old_value, int(i_port.idx == 3))
+ # set_value additionally set_shape and propagate value to Const node
+ i_port.data.set_value(new_value)
# extend masks before removing ellipsis
if np.any(node.ellipsis_mask):
for attr in ["new_axis_mask", "shrink_axis_mask", "begin_mask", "end_mask"]:
node[attr] = int64_array(extend_mask_according_ellipsis(node.ellipsis_mask, node.shrink_axis_mask,
len(node.out_port(0).data.get_shape()),
- list(node[attr]),
- attr in ["begin_mask", "end_mask"]))
+ list(node[attr]), 0))
# due to permutation from nhwc to nchw we will extend all masks and inputs
idx = np.nonzero(node.ellipsis_mask)
from mo.ops.op import PermuteAttrs
from mo.ops.strided_slice import extend_mask_according_ellipsis, permute_masks, permute_array_with_ellipsis, \
StridedSlice
+from mo.utils.error import Error
from mo.utils.unittest.graph import build_graph
nodes_attributes = {
+ 'input': {
+ 'kind': 'op',
+ 'op': None
+ },
'data_1': {
'kind': 'data',
'shape': None,
'value': None,
},
'begin': {
+ 'kind': 'op',
+ 'op': 'Const',
+ 'value': None,
+ 'shape': None
+ },
+ 'begin_data': {
'kind': 'data',
'shape': None,
'value': np.array([]),
},
'end': {
+ 'kind': 'op',
+ 'op': 'Const',
+ 'value': None,
+ 'shape': None
+ },
+ 'end_data': {
'kind': 'data',
'shape': None,
'value': np.array([]),
},
'stride': {
+ 'kind': 'op',
+ 'op': 'Const',
+ 'value': None,
+ 'shape': None
+ },
+ 'stride_data': {
'kind': 'data',
'shape': None,
'value': np.array([]),
slice_node = Node(graph, 'strided_slice')
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask')
- self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0])))
+ self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0])))
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask')
- self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0])))
+ self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0])))
def test_permute_begin_end_long(self):
# Testing constant path case
slice_node = Node(graph, 'strided_slice')
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask')
- self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0, 1])))
+ self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0, 0])))
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask')
- self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0, 1])))
+ self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0, 0])))
def test_permute_begin_end_shrink(self):
# Testing constant path case
slice_node = Node(graph, 'strided_slice')
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask')
- self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0])))
+ self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0])))
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask')
- self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0])))
+ self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0])))
def test_permute_begin_end_ellipsis(self):
# Testing constant path case
graph = build_graph(nodes_attributes,
- [('data_1', 'strided_slice'),
- ('begin', 'strided_slice'),
- ('end', 'strided_slice'),
- ('stride', 'strided_slice'),
+ [('input', 'data_1'),
+ ('data_1', 'strided_slice'),
+ ('begin', 'begin_data'),
+ ('begin_data', 'strided_slice'),
+ ('end', 'end_data'),
+ ('end_data', 'strided_slice'),
+ ('stride', 'stride_data'),
+ ('stride_data', 'strided_slice'),
('strided_slice', 'data_2')],
{'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+ 'begin': {'value': [0, 1], 'shape': [2]},
+ 'end': {'value': [1, 0], 'shape': [2]},
+ 'stride': {'value': [1, 2], 'shape': [2]},
'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]),
'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0],
'ellipsis_mask': np.array([1, 0])},
slice_node = Node(graph, 'strided_slice')
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask')
- self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 1, 1])))
+ self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 0])))
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask')
- self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 1, 1])))
+ self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 0])))
def test_permute_begin_end_ellipsis_infer(self):
# Testing constant path case
graph = build_graph(nodes_attributes,
- [('data_1', 'strided_slice'),
- ('begin', 'strided_slice'),
- ('end', 'strided_slice'),
- ('stride', 'strided_slice'),
+ [('input', 'data_1'),
+ ('data_1', 'strided_slice', {'in': 0}),
+ ('begin', 'begin_data'),
+ ('begin_data', 'strided_slice', {'in': 1}),
+ ('end', 'end_data'),
+ ('end_data', 'strided_slice', {'in': 2}),
+ ('stride', 'stride_data'),
+ ('stride_data', 'strided_slice', {'in': 3}),
('strided_slice', 'data_2')],
{'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+ 'begin': {'value': [0, 1], 'shape': [2]},
+ 'end': {'value': [1, 0], 'shape': [2]},
+ 'stride': {'value': [1, 2], 'shape': [2]},
+ 'begin_data': {'value': [0, 1], 'shape': [2]},
+ 'end_data': {'value': [1, 0], 'shape': [2]},
+ 'stride_data': {'value': [1, 2], 'shape': [2]},
'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]),
'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0],
'ellipsis_mask': np.array([1, 0])},
graph.graph['layout'] = "NHWC"
slice_node = Node(graph, 'strided_slice')
+ begin_node = Node(graph, 'begin')
+ end_node = Node(graph, 'end')
+ stride_node = Node(graph, 'stride')
StridedSlice.infer(slice_node)
- self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 1, 1, 0])))
- self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 1, 1, 0])))
+ self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 0])))
+ self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 0])))
self.assertTrue(np.array_equal(slice_node.shrink_axis_mask, np.array([0, 0, 0, 0])))
self.assertTrue(np.array_equal(slice_node.new_axis_mask, np.array([0, 0, 0, 0])))
+ self.assertTrue(np.array_equal(begin_node.value, np.array([0, 1, 0, 0])))
+ self.assertTrue(np.array_equal(end_node.value, np.array([1, 0, 0, 0])))
+ self.assertTrue(np.array_equal(stride_node.value, np.array([1, 2, 1, 1])))
def test_permute_begin_end_ellipsis_new(self):
# Testing constant path case
graph = build_graph(nodes_attributes,
- [('data_1', 'strided_slice'),
- ('begin', 'strided_slice'),
- ('end', 'strided_slice'),
- ('stride', 'strided_slice'),
+ [('input', 'data_1'),
+ ('data_1', 'strided_slice', {'in': 0}),
+ ('begin', 'begin_data'),
+ ('begin_data', 'strided_slice', {'in': 1}),
+ ('end', 'end_data'),
+ ('end_data', 'strided_slice', {'in': 2}),
+ ('stride', 'stride_data'),
+ ('stride_data', 'strided_slice', {'in': 3}),
('strided_slice', 'data_2')],
{'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
- 'strided_slice': {'begin_mask': np.array([0, 0, 0]), 'end_mask': np.array([1, 0, 0]),
+ 'begin': {'value': [0, 1, 0], 'shape': [3]},
+ 'begin_data': {'value': [0, 1, 0], 'shape': [3]},
+ 'end': {'value': [1, 0, 1], 'shape': [3]},
+ 'end_data': {'value': [1, 0, 1], 'shape': [3]},
+ 'stride': {'value': [1, 2, 3], 'shape': [3]},
+ 'stride_data': {'value': [1, 2, 3], 'shape': [3]},
+ 'strided_slice': {'begin_mask': np.array([1, 2, 3]), 'end_mask': np.array([1, 2, 3]),
'new_axis_mask': np.array([1, 0, 0]), 'shrink_axis_mask': [0],
'ellipsis_mask': np.array([0, 1, 0])},
'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None},
slice_node = Node(graph, 'strided_slice')
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask')
- self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 1, 1])))
+ self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 3, 2, 0, 0])))
permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask')
- self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 1, 1])))
+ self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 3, 2, 0, 0])))
def test_permute_begin_end_ellipsis_new_inputs(self):
# Testing constant path case
graph = build_graph(nodes_attributes,
- [('data_1', 'strided_slice'),
- ('begin', 'strided_slice'),
- ('end', 'strided_slice'),
- ('stride', 'strided_slice'),
+ [('input', 'data_1'),
+ ('data_1', 'strided_slice', {'in': 0}),
+ ('begin', 'begin_data'),
+ ('begin_data', 'strided_slice', {'in': 1}),
+ ('end', 'end_data'),
+ ('end_data', 'strided_slice', {'in': 2}),
+ ('stride', 'stride_data'),
+ ('stride_data', 'strided_slice', {'in': 3}),
('strided_slice', 'data_2')],
{'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
'strided_slice': {'begin_mask': np.array([0, 0, 0]), 'end_mask': np.array([1, 0, 0]),
'begin': {'value': np.array([0, 1, 2])},
'end': {'value': np.array([1, 2, 3])},
'stride': {'value': np.array([1, 1, 1])},
+ 'begin_data': {'value': np.array([0, 1, 2])},
+ 'end_data': {'value': np.array([1, 2, 3])},
+ 'stride_data': {'value': np.array([1, 1, 1])},
'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None},
})
shrink_mask = extend_mask_according_ellipsis(ellipsis_mask, shrink_mask, length_shape, list(shrink_mask),
ins_value)
self.assertEquals(shrink_mask, [0, 0, 2, 2, 1])
+
+ def test_non_const_infer(self):
+ # Testing constant path case
+ graph = build_graph(nodes_attributes,
+ [('input', 'data_1'),
+ ('data_1', 'strided_slice', {'in': 0}),
+ ('data_1', 'strided_slice', {'in': 1}),
+ ('end', 'end_data'),
+ ('end_data', 'strided_slice', {'in': 2}),
+ ('stride', 'stride_data'),
+ ('stride_data', 'strided_slice', {'in': 3}),
+ ('strided_slice', 'data_2')],
+ {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+ 'end': {'value': [1, 0], 'shape': [2]},
+ 'stride': {'value': [1, 2], 'shape': [2]},
+ 'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]),
+ 'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0],
+ 'ellipsis_mask': np.array([1, 0])},
+ 'data_2': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+ })
+ graph.graph['layout'] = "NHWC"
+
+ slice_node = Node(graph, 'strided_slice')
+ with self.assertRaises(Error) as error:
+ StridedSlice.infer(slice_node)
+ self.assertTrue('Strided slice layer supports only constant begin and end inputs' in str(error.exception))
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Node
from mo.ops.unsqueeze import Unsqueeze
-from mo.utils.unittest.graph import build_graph
from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
@generator
limitations under the License.
"""
import logging as log
-
-import networkx as nx
import os
from enum import Enum
+import networkx as nx
+
from mo.graph.graph import Graph
from mo.middle.passes.eliminate import shape_inference
from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively
import os
from re import compile, match
-import networkx as nx
-
from mo.graph.graph import Node, Graph
from mo.utils.error import Error
from mo.utils.graph import nodes_matching_name_pattern, sub_graph_between_nodes
limitations under the License.
"""
+import logging as log
from collections import deque
from re import match, compile
-import logging as log
import networkx as nx
from mo.graph.graph import Node, Graph
import unittest
-import networkx as nx
-
+from mo.graph.graph import Graph
from mo.utils.error import Error
from mo.utils.graph import bfs_search, is_connected_component, sub_graph_between_nodes
-from mo.graph.graph import Graph
+
class TestGraphUtils(unittest.TestCase):
def test_simple_dfs(self):
"""
import logging as log
-from mo.utils.graph import Node
-from mo.utils import class_registration
from mo.front.common.partial_infer.utils import int64_array
+from mo.utils import class_registration
+from mo.utils.graph import Node
class Extender(object):
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.common.partial_infer.utils import mark_input_bins
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
+
+
+class GRUCell_extender(Extender):
+ op = 'GRUCell'
+
+ @staticmethod
+ def extend(op: Node):
+ if not op.has_valid('activations'):
+ op['activations'] = None
+
+ mark_input_bins(op, start_port=2)
+
+ op['need_copy_input_blobs'] = True
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
-
-from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.ir_reader.extender import Extender
class LSTMCell_extender(Extender):
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
+
+
+class RNNCell_extender(Extender):
+ op = 'RNNCell'
+
+ @staticmethod
+ def extend(op: Node):
+ if not op.has_valid('activations'):
+ op['activations'] = None
limitations under the License.
"""
+from mo.utils.graph import Node
from mo.utils.ir_reader.extender import Extender
from mo.utils.ir_reader.extenders.conv_extender import Conv_extender
-from mo.utils.graph import Node
class BinaryConv_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class Conv_extender(Extender):
import numpy as np
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class ConvolutionBackpropData_extender(Extender):
limitations under the License.
"""
+from mo.utils.graph import Node
from mo.utils.ir_reader.extender import Extender
from mo.utils.ir_reader.extenders.conv_extender import Conv_extender
-from mo.utils.graph import Node
class DeformableConv_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class ExperimentalDetectronROIFeatureExtractor_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class FakeQuantize_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class Interpolate_extender(Extender):
--- /dev/null
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.middle.passes.convert_data_type import destination_type_to_np_data_type
+
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
+
+
+class NonZeroExtender(Extender):
+ op = 'NonZero'
+
+ @staticmethod
+ def extend(op: Node):
+ op['output_type'] = destination_type_to_np_data_type(op.output_type)
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class Pad_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
+from mo.middle.passes.convert_data_type import destination_type_to_np_data_type
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
-from mo.middle.passes.convert_data_type import destination_type_to_np_data_type
class Parameter_extender(Extender):
op = 'Parameter'
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
-from mo.graph.graph import Node
-
from mo.front.common.partial_infer.utils import int64_array
+from mo.graph.graph import Node
+from mo.utils.ir_reader.extender import Extender
class AvgPool_extender(Extender):
limitations under the License.
"""
+from mo.utils.graph import Node
from mo.utils.ir_reader.extender import Extender
from mo.utils.ir_reader.extenders.priorbox_extender import PriorBox_extender
-from mo.utils.graph import Node
class PriorBoxClustered_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
from mo.front.common.partial_infer.multi_box_prior import multi_box_prior_infer_mxnet
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class PriorBox_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class ReorgYolo_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class StridedSlice_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
-
+from mo.utils.ir_reader.extender import Extender
from mo.utils.ir_reader.layer_to_class import copy_graph_with_ops
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class TopK_extender(Extender):
limitations under the License.
"""
-from mo.utils.ir_reader.extender import Extender
from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
class VariadicSplit_extender(Extender):
from extensions.ops.activation_ops import Activation
from extensions.ops.elementwise import Elementwise, LogicalElementwise, BiasAdd, Div, Mul, Pow, Sub
from extensions.ops.psroipooling import DeformablePSROIPoolingOp
+from extensions.ops.scatter import Scatter
from extensions.ops.split import Split, VariadicSplit
from mo.front.common.partial_infer.utils import int64_array
from mo.graph.graph import Graph, Node
"""
import_by_path(os.path.join(path, 'mo', 'ops'), ['mo', 'ops'])
import_by_path(os.path.join(path, 'extensions', 'ops'), ['extensions', 'ops'])
- update_registration(classes=[Op, Activation, Elementwise, LogicalElementwise, ReduceOp],
+ update_registration(classes=[Op, Activation, Elementwise, LogicalElementwise, ReduceOp, Scatter],
enabled_transforms=[], disabled_transforms=[])
if weights_rounded[elem] == 0:
weights_rounded[elem] -= 1 # pylint: disable=unsupported-assignment-operation
assert len(weights_rounded) % 8 == 0
- weights_rounded = weights_rounded.reshape([len(weights_rounded) // 8, 8]) # pylint: disable=no-member
+ weights_rounded = weights_rounded.reshape([len(weights_rounded) // 8, 8]) # pylint: disable=no-member
weights_rounded = np.flip(weights_rounded, axis=1)
value = weights_rounded.flatten()
:param op:
:return:
"""
- assert op.soft_get('type') == 'GroupConvolution', 'Wrong operation type, {} instead of GroupConvolution!' \
- ''.format(op.soft_get('type'))
+ assert op.soft_get('type') == 'GroupConvolution', \
+ 'Wrong operation type, {} instead of GroupConvolution!'.format(op.soft_get('type'))
weights_shape = op.in_port(1).data.get_shape()
group = weights_shape[0]
weights_node.value = np.reshape(weights_node.value, new_shape)
elif weights_node.type == 'Reshape':
# we remove reshape node added in ConvolutionWithGroupsResolver pass
- assert weights_node.in_port(0).get_source().data.get_shape() == new_shape, 'Weight shape and calculated ' \
- 'shape mismatch in GroupConv node {}.'.format(op.name)
+ assert weights_node.in_port(0).get_source().data.get_shape() == new_shape, \
+ 'Weight shape and calculated shape mismatch in GroupConv node {}.'.format(op.name)
op.in_port(1).disconnect()
weights_node.in_port(0).get_source().get_connection().set_destination(op.in_port(1))
else:
- assert op.in_port(1).get_source().data.get_shape() == new_shape, 'Weight shape and calculated ' \
- 'shape mismatch in GroupConv node {}.'.format(op.name)
+ assert op.in_port(1).get_source().data.get_shape() == new_shape, \
+ 'Weight shape and calculated shape mismatch in GroupConv node {}.'.format(op.name)
# we need to set this attrs for correct shape infer as convolution
op['group'] = group
op.type = 'Convolution'
:param op:
:return:
"""
- assert op.soft_get('type') in ('ConvolutionBackpropData', 'GroupConvolutionBackpropData'),\
+ assert op.soft_get('type') in ('ConvolutionBackpropData', 'GroupConvolutionBackpropData'), \
'Wrong operation type, {} instead of ConvolutionBackpropData/GroupConvolutionBackpropData!' \
''.format(op.soft_get('type'))
i += 1
+def copy_input_blobs(op: Node, copy_op: Node):
+ """
+ Function copy input blob data nodes from restored graph to copied one
+ :param op: Node from restored graph
+ :param copy_op: Node from copied graph
+ :return:
+ """
+ for u, d in op.get_sorted_inputs():
+ if 'bin' in d:
+ Op.create_and_connect_input_data_node(copy_op.graph, copy_op,
+ {'value': op.in_node(d['in']).value,
+ 'shape': op.in_node(d['in']).shape}, d)
+
+
# Map with preprocessing functions
preprocessing_op_nodes = {
'Const': propagate_const_values,
'please check it!'.format(op_type)
node = Op.get_op_class_by_name(op_type)(new_graph, op.attrs()).create_node()
+ if op.has_and_set('need_copy_input_blobs'):
+ copy_input_blobs(op, node)
+
# Collect node connections
mapping_of_old_idx_into_new[op.id] = node.id
node_connections[op.id] = collect_node_outputs(op)
from mo.utils.error import Error
from mo.utils.simple_proto_parser import SimpleProtoParser
-
# The list of rules how to map the value from the pipeline.config file to the dictionary with attributes.
# The rule is either a string or a tuple with two elements. In the first case the rule string is used as a key to
# search in the parsed pipeline.config file attributes dictionary and a key to save found value. In the second case the
limitations under the License.
"""
-import networkx as nx
-
from mo.graph.graph import Graph
from mo.middle.pattern_match import apply_pattern
limitations under the License.
"""
from argparse import Namespace
+from copy import deepcopy
import networkx as nx
-from copy import deepcopy
from mo.front.common.partial_infer.utils import int64_array
from mo.front.extractor import extract_port_from_string
import collections
-import networkx as nx
-
from mo.graph.graph import Node, Graph
"""
import unittest
-import networkx as nx
+
import numpy as np
+
from mo.utils.utils import match_shapes
import unittest
import unittest.mock as mock
-
from unittest.mock import mock_open
+
from mo.utils.versions_checker import get_module_version_list_from_file, parse_versions_list
+
class TestingVersionsChecker(unittest.TestCase):
@mock.patch('builtins.open', new_callable=mock_open, create=True)
def test_get_module_version_list_from_file(self, mock_open):
coverage==4.4.2
m2r==0.1.12
pyenchant==1.6.11
+astroid==2.1.0
pylint==2.1.1
Sphinx==1.6.5
safety==1.8.5
-Subproject commit edc65ca0111f86a7e63a98f62cb17d153cc2535c
+Subproject commit eaa6d35b7ed415e02b2401b528f31960123e5b71
--- /dev/null
+=====================================================
+Demo Scripts for Model Optimizer and Inference Engine
+=====================================================
+
+The demo scripts illustrate Intel(R) Deep Learning Deployment Toolkit usage to convert and optimize pre-trained models and perform inference.
+
+Setting Up Demos
+================
+If you are behind a proxy, set the following environment variables in the console session:
+
+On Linux* and Mac OS:
+export http_proxy=http://<proxyHost>:<proxyPort>
+export https_proxy=https://<proxyHost>:<proxyPort>
+
+On Windows* OS:
+set http_proxy=http://<proxyHost>:<proxyPort>
+set https_proxy=https://<proxyHost>:<proxyPort>
+
+Running Demos
+=============
+
+The "demo" folder contains three scripts:
+
+1. Classification demo using public SqueezeNet topology (demo_squeezenet_download_convert_run.sh|bat)
+
+2. Security barrier camera demo that showcases three models coming with the product (demo_security_barrier_camera.sh|bat)
+
+3. Benchmark demo using public SqueezeNet topology (demo_benchmark_app.sh|bat)
+
+To run the demos, run demo_squeezenet_download_convert_run.sh or demo_security_barrier_camera.sh or demo_benchmark_app.sh (*.bat on Windows) scripts from the console without parameters, for example:
+
+./demo_squeezenet_download_convert_run.sh
+
+The script allows you to specify the target device to infer on using the -d <CPU|GPU|MYRIAD|FPGA> option.
+
+Classification Demo Using SqueezeNet
+====================================
+
+The demo illustrates the general workflow of using the Intel(R) Deep Learning Deployment Toolkit and performs the following:
+
+ - Downloads a public SqueezeNet model using the Model Downloader (open_model_zoo\tools\downloader\downloader.py)
+ - Installs all prerequisites required for running the Model Optimizer using the scripts from the "model_optimizer\install_prerequisites" folder
+ - Converts SqueezeNet to an IR using the Model Optimizer (model_optimizer\mo.py) via the Model Converter (open_model_zoo\tools\downloader\converter.py)
+ - Builds the Inference Engine classification_sample (inference_engine\samples\classification_sample)
+ - Runs the sample with the car.png picture located in the demo folder
+
+The sample application prints top-10 inference results for the picture.
+
+For more information about the Inference Engine classification sample, refer to the documentation available in the sample folder.
+
+
+Security Barrier Camera Demo
+============================
+
+The demo illustrates using the Inference Engine with pre-trained models to perform vehicle detection, vehicle attributes and license-plate recognition tasks.
+As the sample produces visual output, it should be run in GUI mode.
+
+The demo script does the following:
+
+- Builds the Inference Engine security barrier camera sample (inference_engine\samples\security_barrier_camera_sample)
+- Runs the sample with the car_1.bmp located in the demo folder
+
+The sample application displays the resulting frame with detections rendered as bounding boxes and text.
+
+For more information about the Inference Engine security barrier camera sample, refer to the documentation available in the sample folder.
+
+
+Benchmark Demo Using SqueezeNet
+===============================
+
+The demo illustrates how to use the Benchmark Application to estimate deep learning inference performance on supported devices.
+
+The demo script does the following:
+
+ - Downloads a public SqueezeNet model using the Model Downloader (open_model_zoo\tools\downloader\downloader.py)
+ - Installs all prerequisites required for running the Model Optimizer using the scripts from the "model_optimizer\install_prerequisites" folder
+ - Converts SqueezeNet to an IR using the Model Optimizer (model_optimizer\mo.py) via the Model Converter (open_model_zoo\tools\downloader\converter.py)
+ - Builds the Inference Engine benchmark tool (inference_engine\samples\benchmark_app)
+ - Runs the tool with the car.png picture located in the demo folder
+
+The benchmark app prints performance counters, resulting latency, and throughput values.
+
+For more information about the Inference Engine benchmark app, refer to the documentation available in the sample folder.
\ No newline at end of file
--- /dev/null
+:: Copyright (C) 2018-2019 Intel Corporation
+:: SPDX-License-Identifier: Apache-2.0
+
+@echo off
+setlocal enabledelayedexpansion
+
+set TARGET=CPU
+set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO
+
+:: command line arguments parsing
+:input_arguments_loop
+if not "%1"=="" (
+ if "%1"=="-d" (
+ set TARGET=%2
+ echo target = !TARGET!
+ shift
+ )
+ if "%1"=="-sample-options" (
+ set SAMPLE_OPTIONS=%2 %3 %4 %5 %6
+ echo sample_options = !SAMPLE_OPTIONS!
+ shift
+ )
+ if "%1"=="-help" (
+ echo %~n0%~x0 is benchmark demo using public SqueezeNet topology
+ echo.
+ echo Options:
+ echo -d name Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified
+ exit /b
+ )
+ shift
+ goto :input_arguments_loop
+)
+
+IF "%SAMPLE_OPTIONS%"=="" (
+ set SAMPLE_OPTIONS=-niter 1000
+)
+
+set ROOT_DIR=%~dp0
+
+set TARGET_PRECISION=FP16
+
+
+echo target_precision = !TARGET_PRECISION!
+
+set models_path=%BUILD_FOLDER%\openvino_models\models
+set models_cache=%BUILD_FOLDER%\openvino_models\cache
+set irs_path=%BUILD_FOLDER%\openvino_models\ir
+
+set model_name=squeezenet1.1
+
+set target_image_path=%ROOT_DIR%car.png
+
+if exist "%ROOT_DIR%..\..\bin\setupvars.bat" (
+ call "%ROOT_DIR%..\..\bin\setupvars.bat"
+) else (
+ echo setupvars.bat is not found, INTEL_OPENVINO_DIR can't be set
+ goto error
+)
+
+echo INTEL_OPENVINO_DIR is set to %INTEL_OPENVINO_DIR%
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+ echo Error^: Python is not installed. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+ goto error
+)
+
+:: Check if Python version is equal or higher 3.4
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+ set version=%%F
+)
+echo %var%
+
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+ set Major=%%b
+ set Minor=%%c
+)
+
+if "%Major%" geq "3" (
+ if "%Minor%" geq "5" (
+ set python_ver=okay
+ )
+)
+if not "%python_ver%"=="okay" (
+ echo Unsupported Python version. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+ goto error
+)
+
+:: install yaml python modules required for downloader.py
+pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in"
+if ERRORLEVEL 1 GOTO errorHandling
+
+set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader
+
+for /F "tokens=* usebackq" %%d in (
+ `python "%downloader_dir%\info_dumper.py" --name "%model_name%" ^|
+ python -c "import sys, json; print(json.load(sys.stdin)[0]['subdirectory'])"`
+) do (
+ set model_dir=%%d
+)
+
+set ir_dir=%irs_path%\%model_dir%\%target_precision%
+
+echo Download public %model_name% model
+echo python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+echo %model_name% model downloading completed
+
+timeout 7
+
+if exist %ir_dir% (
+ echo.
+ echo Target folder %ir_dir% already exists. Skipping IR generation with Model Optimizer.
+ echo If you want to convert a model again, remove the entire %ir_dir% folder.
+ timeout 7
+ GOTO buildSample
+)
+
+echo.
+echo ###############^|^| Install Model Optimizer prerequisites ^|^|###############
+echo.
+timeout 3
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\install_prerequisites"
+call install_prerequisites_caffe.bat
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+echo.
+echo ###############^|^| Run Model Optimizer ^|^|###############
+echo.
+timeout 3
+
+::set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+echo python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:buildSample
+echo.
+echo ###############^|^| Generate VS solution for Inference Engine samples using cmake ^|^|###############
+echo.
+timeout 3
+
+if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
+ set "PLATFORM=x64"
+) else (
+ set "PLATFORM=Win32"
+)
+
+set VSWHERE="false"
+if exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+ set VSWHERE="true"
+ cd "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer"
+) else if exist "%ProgramFiles%\Microsoft Visual Studio\Installer\vswhere.exe" (
+ set VSWHERE="true"
+ cd "%ProgramFiles%\Microsoft Visual Studio\Installer"
+) else (
+ echo "vswhere tool is not found"
+)
+
+set MSBUILD_BIN=
+set VS_PATH=
+
+if !VSWHERE! == "true" (
+ for /f "usebackq tokens=*" %%i in (`vswhere -latest -products * -requires Microsoft.Component.MSBuild -property installationPath`) do (
+ set VS_PATH=%%i
+ )
+ if exist "!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe"
+ )
+ if exist "!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe"
+ )
+ if exist "!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe"
+ )
+)
+
+if "!MSBUILD_BIN!" == "" (
+ if exist "C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=14 2015"
+ )
+ if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=15 2017"
+ )
+ if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=15 2017"
+ )
+ if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=15 2017"
+ )
+) else (
+ if not "!MSBUILD_BIN:2019=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=16 2019"
+ if not "!MSBUILD_BIN:2017=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=15 2017"
+ if not "!MSBUILD_BIN:2015=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=14 2015"
+)
+
+if "!MSBUILD_BIN!" == "" (
+ echo Build tools for Visual Studio 2015 / 2017 / 2019 cannot be found. If you use Visual Studio 2017, please download and install build tools from https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2017
+ GOTO errorHandling
+)
+
+set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_samples_build"
+
+echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^
+if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt"
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio !MSBUILD_VERSION!" -A %PLATFORM% "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+echo.
+echo ###############^|^| Build Inference Engine samples using MS Visual Studio (MSBuild.exe) ^|^|###############
+echo.
+timeout 3
+echo !MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:benchmark_app /clp:ErrorsOnly /m
+"!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:benchmark_app /clp:ErrorsOnly /m
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:runSample
+echo.
+echo ###############^|^| Run Inference Engine benchmark app ^|^|###############
+echo.
+timeout 3
+copy /Y "%ROOT_DIR%%model_name%.labels" "%ir_dir%"
+cd "%SOLUTION_DIR64%\intel64\Release"
+
+echo benchmark_app.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -pc -d !TARGET! !SAMPLE_OPTIONS!
+benchmark_app.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -pc -d !TARGET! !SAMPLE_OPTIONS!
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+echo.
+echo ###############^|^| Inference Engine benchmark app completed successfully ^|^|###############
+
+timeout 10
+cd "%ROOT_DIR%"
+
+goto :eof
+
+:errorHandling
+echo Error
+cd "%ROOT_DIR%"
--- /dev/null
+#!/usr/bin/env bash
+
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+. "$ROOT_DIR/utils.sh"
+
+usage() {
+ echo "Benchmark demo using public SqueezeNet topology"
+ echo "-d name specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified"
+ echo "-help print help message"
+ exit 1
+}
+
+trap 'error ${LINENO}' ERR
+
+target="CPU"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+ -h | -help | --help)
+ usage
+ ;;
+ -d)
+ target="$2"
+ echo target = "${target}"
+ shift
+ ;;
+ -sample-options)
+ sampleoptions="$2 $3 $4 $5 $6"
+ echo sample-options = "${sampleoptions}"
+ shift
+ ;;
+ *)
+ # unknown option
+ ;;
+esac
+shift
+done
+
+if ([ -z "$sampleoptions" ]); then
+ sampleoptions="-niter 1000"
+fi
+
+target_precision="FP16"
+
+printf "target_precision = ${target_precision}\n"
+
+models_path="$HOME/openvino_models/models"
+models_cache="$HOME/openvino_models/cache"
+irs_path="$HOME/openvino_models/ir"
+
+model_name="squeezenet1.1"
+
+target_image_path="$ROOT_DIR/car.png"
+
+run_again="Then run the script again\n\n"
+dashes="\n\n###################################################\n\n"
+
+
+if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then
+ setupvars_path="$ROOT_DIR/../../bin/setupvars.sh"
+else
+ printf "Error: setupvars.sh is not found\n"
+fi
+
+if ! . $setupvars_path ; then
+ printf "Unable to run ./setupvars.sh. Please check its presence. ${run_again}"
+ exit 1
+fi
+
+# Step 1. Download the Caffe model and the prototxt of the model
+printf "${dashes}"
+printf "\n\nDownloading the Caffe model and the prototxt"
+
+cur_path=$PWD
+
+printf "\nInstalling dependencies\n"
+
+if [[ -f /etc/centos-release ]]; then
+ DISTRO="centos"
+elif [[ -f /etc/lsb-release ]]; then
+ DISTRO="ubuntu"
+fi
+
+if [[ $DISTRO == "centos" ]]; then
+ sudo -E yum install -y centos-release-scl epel-release
+ sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \
+ glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake
+
+ sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true
+ sudo -E yum install -y epel-release
+ sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel
+
+ # check installed Python version
+ if command -v python3.5 >/dev/null 2>&1; then
+ python_binary=python3.5
+ pip_binary=pip3.5
+ fi
+ if command -v python3.6 >/dev/null 2>&1; then
+ python_binary=python3.6
+ pip_binary=pip3.6
+ fi
+ if [ -z "$python_binary" ]; then
+ sudo -E yum install -y rh-python36 || true
+ . scl_source enable rh-python36
+ python_binary=python3.6
+ pip_binary=pip3.6
+ fi
+elif [[ $DISTRO == "ubuntu" ]]; then
+ sudo -E apt update
+ print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base
+ python_binary=python3
+ pip_binary=pip3
+
+ system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
+ if [ $system_ver = "18.04" ]; then
+ sudo -E apt-get install -y libpng-dev
+ else
+ sudo -E apt-get install -y libpng12-dev
+ fi
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+ # check installed Python version
+ if command -v python3.7 >/dev/null 2>&1; then
+ python_binary=python3.7
+ pip_binary=pip3.7
+ elif command -v python3.6 >/dev/null 2>&1; then
+ python_binary=python3.6
+ pip_binary=pip3.6
+ elif command -v python3.5 >/dev/null 2>&1; then
+ python_binary=python3.5
+ pip_binary=pip3.5
+ else
+ python_binary=python3
+ pip_binary=pip3
+ fi
+fi
+
+if ! command -v $python_binary &>/dev/null; then
+ printf "\n\nPython 3.5 (x64) or higher is not installed. It is required to run Model Optimizer, please install it. ${run_again}"
+ exit 1
+fi
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+ $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+else
+ sudo -E $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+fi
+
+downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader"
+
+model_dir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" |
+ "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])')
+
+downloader_path="$downloader_dir/downloader.py"
+
+print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}"
+
+ir_dir="${irs_path}/${model_dir}/${target_precision}"
+
+if [ ! -e "$ir_dir" ]; then
+ # Step 2. Configure Model Optimizer
+ printf "${dashes}"
+ printf "Install Model Optimizer dependencies\n\n"
+ cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/install_prerequisites"
+ . ./install_prerequisites.sh caffe
+ cd $cur_path
+
+ # Step 3. Convert a model with Model Optimizer
+ printf "${dashes}"
+ printf "Convert a model with Model Optimizer\n\n"
+
+ mo_path="${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/mo.py"
+
+ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+ print_and_run "$python_binary" "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision"
+else
+ printf "\n\nTarget folder ${ir_dir} already exists. Skipping IR generation with Model Optimizer."
+ printf "If you want to convert a model again, remove the entire ${ir_dir} folder. ${run_again}"
+fi
+
+# Step 4. Build samples
+printf "${dashes}"
+printf "Build Inference Engine samples\n\n"
+
+OS_PATH=$(uname -m)
+NUM_THREADS="-j2"
+
+if [ $OS_PATH == "x86_64" ]; then
+ OS_PATH="intel64"
+ NUM_THREADS="-j8"
+fi
+
+samples_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/samples/cpp"
+build_dir="$HOME/inference_engine_samples_build"
+binaries_dir="${build_dir}/${OS_PATH}/Release"
+
+if [ -e $build_dir/CMakeCache.txt ]; then
+ rm -rf $build_dir/CMakeCache.txt
+fi
+mkdir -p $build_dir
+cd $build_dir
+cmake -DCMAKE_BUILD_TYPE=Release $samples_path
+
+make $NUM_THREADS benchmark_app
+
+# Step 5. Run samples
+printf "${dashes}"
+printf "Run Inference Engine benchmark app\n\n"
+
+cd $binaries_dir
+
+cp -f $ROOT_DIR/${model_name}.labels ${ir_dir}/
+
+print_and_run ./benchmark_app -d "$target" -i "$target_image_path" -m "${ir_dir}/${model_name}.xml" -pc ${sampleoptions}
+
+printf "${dashes}"
+
+printf "Inference Engine benchmark app completed successfully.\n\n"
--- /dev/null
+:: Copyright (C) 2018-2019 Intel Corporation
+:: SPDX-License-Identifier: Apache-2.0
+
+@echo off
+setlocal enabledelayedexpansion
+
+set TARGET=CPU
+set SAMPLE_OPTIONS=
+set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO
+
+:: command line arguments parsing
+:input_arguments_loop
+if not "%1"=="" (
+ if "%1"=="-d" (
+ set TARGET=%2
+ echo target = !TARGET!
+ shift
+ )
+ if "%1"=="-sample-options" (
+ set SAMPLE_OPTIONS=%2 %3 %4 %5 %6
+ echo sample_options = !SAMPLE_OPTIONS!
+ shift
+ )
+ if "%1"=="-help" (
+ echo %~n0%~x0 is security barrier camera demo that showcases three models coming with the product
+ echo.
+ echo Options:
+ echo -d name Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified
+ exit /b
+ )
+ shift
+ goto :input_arguments_loop
+)
+
+set ROOT_DIR=%~dp0
+
+set target_image_path=%ROOT_DIR%car_1.bmp
+
+
+set TARGET_PRECISION=FP16
+
+
+echo target_precision = !TARGET_PRECISION!
+
+if exist "%ROOT_DIR%..\..\bin\setupvars.bat" (
+ call "%ROOT_DIR%..\..\bin\setupvars.bat"
+) else (
+ echo setupvars.bat is not found, INTEL_OPENVINO_DIR can't be set
+ goto error
+)
+
+echo INTEL_OPENVINO_DIR is set to %INTEL_OPENVINO_DIR%
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+ echo Error^: Python is not installed. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+ goto error
+)
+
+:: Check if Python version is equal or higher 3.4
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+ set version=%%F
+)
+echo %var%
+
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+ set Major=%%b
+ set Minor=%%c
+)
+
+if "%Major%" geq "3" (
+ if "%Minor%" geq "5" (
+ set python_ver=okay
+ )
+)
+if not "%python_ver%"=="okay" (
+ echo Unsupported Python version. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+ goto error
+)
+
+:: install yaml python modules required for downloader.py
+pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in"
+if ERRORLEVEL 1 GOTO errorHandling
+
+
+set models_path=%BUILD_FOLDER%\openvino_models\ir
+set models_cache=%BUILD_FOLDER%\openvino_models\cache
+
+if not exist %models_cache% (
+ mkdir %models_cache%
+)
+
+set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader
+
+for /F "tokens=1,2 usebackq" %%a in ("%ROOT_DIR%demo_security_barrier_camera.conf") do (
+ echo python "%downloader_dir%\downloader.py" --name "%%b" --output_dir "%models_path%" --cache_dir "%models_cache%"
+ python "%downloader_dir%\downloader.py" --name "%%b" --output_dir "%models_path%" --cache_dir "%models_cache%"
+
+ for /F "tokens=* usebackq" %%d in (
+ `python "%downloader_dir%\info_dumper.py" --name "%%b" ^|
+ python -c "import sys, json; print(json.load(sys.stdin)[0]['subdirectory'])"`
+ ) do (
+ set model_args=!model_args! %%a "%models_path%\%%d\%target_precision%\%%b.xml"
+ )
+)
+
+echo.
+echo ###############^|^| Generate VS solution for Inference Engine demos using cmake ^|^|###############
+echo.
+timeout 3
+
+if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
+ set "PLATFORM=x64"
+) else (
+ set "PLATFORM=Win32"
+)
+
+set VSWHERE="false"
+if exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+ set VSWHERE="true"
+ cd "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer"
+) else if exist "%ProgramFiles%\Microsoft Visual Studio\Installer\vswhere.exe" (
+ set VSWHERE="true"
+ cd "%ProgramFiles%\Microsoft Visual Studio\Installer"
+) else (
+ echo "vswhere tool is not found"
+)
+
+set MSBUILD_BIN=
+set VS_PATH=
+
+if !VSWHERE! == "true" (
+ for /f "usebackq tokens=*" %%i in (`vswhere -latest -products * -requires Microsoft.Component.MSBuild -property installationPath`) do (
+ set VS_PATH=%%i
+ )
+ if exist "!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe"
+ )
+ if exist "!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe"
+ )
+ if exist "!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe"
+ )
+)
+
+if "!MSBUILD_BIN!" == "" (
+ if exist "C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=14 2015"
+ )
+ if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=15 2017"
+ )
+ if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=15 2017"
+ )
+ if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe" (
+ set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe"
+ set "MSBUILD_VERSION=15 2017"
+ )
+) else (
+ if not "!MSBUILD_BIN:2019=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=16 2019"
+ if not "!MSBUILD_BIN:2017=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=15 2017"
+ if not "!MSBUILD_BIN:2015=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=14 2015"
+)
+
+if "!MSBUILD_BIN!" == "" (
+ echo Build tools for Visual Studio 2015 / 2017 / 2019 cannot be found. If you use Visual Studio 2017 / 2019, please download and install build tools from https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2017
+ GOTO errorHandling
+)
+
+set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_demos_build"
+
+echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^
+if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt"
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\demos" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio !MSBUILD_VERSION!" -A %PLATFORM% "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\demos"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+echo.
+echo ###############^|^| Build Inference Engine demos using MS Visual Studio (MSBuild.exe) ^|^|###############
+echo.
+timeout 3
+echo "!MSBUILD_BIN!" Demos.sln /p:Configuration=Release /t:security_barrier_camera_demo /clp:ErrorsOnly /m
+"!MSBUILD_BIN!" Demos.sln /p:Configuration=Release /t:security_barrier_camera_demo /clp:ErrorsOnly /m
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:runSample
+echo.
+echo ###############^|^| Run Inference Engine security barrier camera demo ^|^|###############
+echo.
+timeout 3
+cd "%SOLUTION_DIR64%\intel64\Release"
+echo "%SOLUTION_DIR64%\intel64\Release\security_barrier_camera_demo.exe" -i "%target_image_path%" %model_args% -d !TARGET! -d_va !TARGET! -d_lpr !TARGET! !SAMPLE_OPTIONS!
+security_barrier_camera_demo.exe -i "%target_image_path%" %model_args% ^
+ -d !TARGET! -d_va !TARGET! -d_lpr !TARGET! !SAMPLE_OPTIONS!
+if ERRORLEVEL 1 GOTO errorHandling
+
+echo.
+echo ###############^|^| Demo completed successfully ^|^|###############
+cd "%ROOT_DIR%"
+
+goto :eof
+
+:errorHandling
+echo Error
+cd "%ROOT_DIR%"
--- /dev/null
+-m vehicle-license-plate-detection-barrier-0106
+-m_lpr license-plate-recognition-barrier-0001
+-m_va vehicle-attributes-recognition-barrier-0039
--- /dev/null
+#!/usr/bin/env bash
+
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+. "$ROOT_DIR/utils.sh"
+
+usage() {
+ echo "Security barrier camera demo that showcases three models coming with the product"
+ echo "-d name specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified"
+ echo "-help print help message"
+ exit 1
+}
+
+trap 'error ${LINENO}' ERR
+
+target="CPU"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+ -h | -help | --help)
+ usage
+ ;;
+ -d)
+ target="$2"
+ echo target = "${target}"
+ shift
+ ;;
+ -sample-options)
+ sampleoptions="$2 $3 $4 $5 $6"
+ echo sample-options = "${sampleoptions}"
+ shift
+ ;;
+ *)
+ # unknown option
+ ;;
+esac
+shift
+done
+
+
+target_image_path="$ROOT_DIR/car_1.bmp"
+
+run_again="Then run the script again\n\n"
+dashes="\n\n###################################################\n\n"
+
+if [[ -f /etc/centos-release ]]; then
+ DISTRO="centos"
+elif [[ -f /etc/lsb-release ]]; then
+ DISTRO="ubuntu"
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+ DISTRO="macos"
+fi
+
+if [[ $DISTRO == "centos" ]]; then
+ sudo -E yum install -y centos-release-scl epel-release
+ sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \
+ glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake
+
+ sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true
+ sudo -E yum install -y epel-release
+ sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel
+
+ # check installed Python version
+ if command -v python3.5 >/dev/null 2>&1; then
+ python_binary=python3.5
+ pip_binary=pip3.5
+ fi
+ if command -v python3.6 >/dev/null 2>&1; then
+ python_binary=python3.6
+ pip_binary=pip3.6
+ fi
+ if [ -z "$python_binary" ]; then
+ sudo -E yum install -y rh-python36 || true
+ . scl_source enable rh-python36
+ python_binary=python3.6
+ pip_binary=pip3.6
+ fi
+elif [[ $DISTRO == "ubuntu" ]]; then
+ sudo -E apt update
+ print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base
+ python_binary=python3
+ pip_binary=pip3
+
+ system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
+ if [ $system_ver = "18.04" ]; then
+ sudo -E apt-get install -y libpng-dev
+ else
+ sudo -E apt-get install -y libpng12-dev
+ fi
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+ # check installed Python version
+ if command -v python3.7 >/dev/null 2>&1; then
+ python_binary=python3.7
+ pip_binary=pip3.7
+ elif command -v python3.6 >/dev/null 2>&1; then
+ python_binary=python3.6
+ pip_binary=pip3.6
+ elif command -v python3.5 >/dev/null 2>&1; then
+ python_binary=python3.5
+ pip_binary=pip3.5
+ else
+ python_binary=python3
+ pip_binary=pip3
+ fi
+fi
+
+if ! command -v $python_binary &>/dev/null; then
+ printf "\n\nPython 3.5 (x64) or higher is not installed. It is required to run Model Optimizer, please install it. ${run_again}"
+ exit 1
+fi
+
+if [[ $DISTRO == "macos" ]]; then
+ $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+else
+ sudo -E $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+fi
+
+if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then
+ setupvars_path="$ROOT_DIR/../../bin/setupvars.sh"
+else
+ printf "Error: setupvars.sh is not found\n"
+fi
+if ! . $setupvars_path ; then
+ printf "Unable to run ./setupvars.sh. Please check its presence. ${run_again}"
+ exit 1
+fi
+
+# Step 1. Downloading Intel models
+printf "${dashes}"
+printf "Downloading Intel models\n\n"
+
+
+target_precision="FP16"
+
+printf "target_precision = ${target_precision}\n"
+
+downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader"
+
+downloader_path="$downloader_dir/downloader.py"
+models_path="$HOME/openvino_models/ir"
+models_cache="$HOME/openvino_models/cache"
+
+declare -a model_args
+
+while read -r model_opt model_name; do
+ model_subdir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" |
+ "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])')
+
+ model_path="$models_path/$model_subdir/$target_precision/$model_name"
+
+ print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "$models_path" --cache_dir "$models_cache"
+
+ model_args+=("$model_opt" "${model_path}.xml")
+done < "$ROOT_DIR/demo_security_barrier_camera.conf"
+
+# Step 2. Build samples
+printf "${dashes}"
+printf "Build Inference Engine demos\n\n"
+
+demos_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/demos"
+
+if ! command -v cmake &>/dev/null; then
+ printf "\n\nCMAKE is not installed. It is required to build Inference Engine demos. Please install it. ${run_again}"
+ exit 1
+fi
+
+OS_PATH=$(uname -m)
+NUM_THREADS="-j2"
+
+if [ $OS_PATH == "x86_64" ]; then
+ OS_PATH="intel64"
+ NUM_THREADS="-j8"
+fi
+
+build_dir="$HOME/inference_engine_demos_build"
+if [ -e $build_dir/CMakeCache.txt ]; then
+ rm -rf $build_dir/CMakeCache.txt
+fi
+mkdir -p $build_dir
+cd $build_dir
+cmake -DCMAKE_BUILD_TYPE=Release $demos_path
+make $NUM_THREADS security_barrier_camera_demo
+
+# Step 3. Run samples
+printf "${dashes}"
+printf "Run Inference Engine security_barrier_camera demo\n\n"
+
+binaries_dir="${build_dir}/${OS_PATH}/Release"
+cd $binaries_dir
+
+print_and_run ./security_barrier_camera_demo -d "$target" -d_va "$target" -d_lpr "$target" -i "$target_image_path" "${model_args[@]}" ${sampleoptions}
+
+printf "${dashes}"
+printf "Demo completed successfully.\n\n"
--- /dev/null
+:: Copyright (C) 2018-2019 Intel Corporation
+:: SPDX-License-Identifier: Apache-2.0
+
+:: Classification demo: downloads the public SqueezeNet model, converts it to
+:: IR with Model Optimizer, builds the Inference Engine samples with MSBuild,
+:: and runs the classification sample on car.png.
+@echo off
+setlocal enabledelayedexpansion
+
+set TARGET=CPU
+set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO
+
+:: command line arguments parsing
+:input_arguments_loop
+if not "%1"=="" (
+    if "%1"=="-d" (
+        set TARGET=%2
+        echo target = !TARGET!
+        shift
+    )
+    if "%1"=="-sample-options" (
+        set SAMPLE_OPTIONS=%2 %3 %4 %5 %6
+        echo sample_options = !SAMPLE_OPTIONS!
+        shift
+    )
+    if "%1"=="-help" (
+        echo %~n0%~x0 is classification demo using public SqueezeNet topology
+        echo.
+        echo Options:
+        echo -d name     Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified
+        exit /b
+    )
+    shift
+    goto :input_arguments_loop
+)
+
+set ROOT_DIR=%~dp0
+
+set TARGET_PRECISION=FP16
+
+echo target_precision = !TARGET_PRECISION!
+
+set models_path=%BUILD_FOLDER%\openvino_models\models
+set models_cache=%BUILD_FOLDER%\openvino_models\cache
+set irs_path=%BUILD_FOLDER%\openvino_models\ir
+
+set model_name=squeezenet1.1
+
+set target_image_path=%ROOT_DIR%car.png
+
+if exist "%ROOT_DIR%..\..\bin\setupvars.bat" (
+    call "%ROOT_DIR%..\..\bin\setupvars.bat"
+) else (
+    echo setupvars.bat is not found, INTEL_OPENVINO_DIR can't be set
+    goto errorHandling
+)
+
+echo INTEL_OPENVINO_DIR is set to %INTEL_OPENVINO_DIR%
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+   echo Error^: Python is not installed. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: Check if Python version is equal or higher 3.4
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+   set version=%%F
+)
+echo %version%
+
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+   set Major=%%b
+   set Minor=%%c
+)
+
+:: Compare numerically (unquoted operands) so versions such as 3.10 or 4.x
+:: are accepted; quoted operands would compare lexically and reject them.
+if %Major% gtr 3 set python_ver=okay
+if %Major% equ 3 if %Minor% geq 5 set python_ver=okay
+if not "%python_ver%"=="okay" (
+   echo Unsupported Python version. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: install yaml python modules required for downloader.py
+pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in"
+if ERRORLEVEL 1 GOTO errorHandling
+
+set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader
+
+for /F "tokens=* usebackq" %%d in (
+    `python "%downloader_dir%\info_dumper.py" --name "%model_name%" ^|
+        python -c "import sys, json; print(json.load(sys.stdin)[0]['subdirectory'])"`
+) do (
+    set model_dir=%%d
+)
+
+set ir_dir=%irs_path%\%model_dir%\%target_precision%
+
+echo Download public %model_name% model
+echo python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+echo %model_name% model downloading completed
+
+timeout 7
+
+if exist %ir_dir% (
+    echo.
+    echo Target folder %ir_dir% already exists. Skipping IR generation with Model Optimizer.
+    echo If you want to convert a model again, remove the entire %ir_dir% folder.
+    timeout 7
+    GOTO buildSample
+)
+
+echo.
+echo ###############^|^| Install Model Optimizer prerequisites ^|^|###############
+echo.
+timeout 3
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\install_prerequisites"
+call install_prerequisites_caffe.bat
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+echo.
+echo ###############^|^| Run Model Optimizer ^|^|###############
+echo.
+timeout 3
+
+::set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+echo python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:buildSample
+echo.
+echo ###############^|^| Generate VS solution for Inference Engine samples using cmake ^|^|###############
+echo.
+timeout 3
+
+if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
+   set "PLATFORM=x64"
+) else (
+   set "PLATFORM=Win32"
+)
+
+:: Locate MSBuild: prefer vswhere (VS 2017+), then fall back to well-known paths.
+set VSWHERE="false"
+if exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+   set VSWHERE="true"
+   cd "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer"
+) else if exist "%ProgramFiles%\Microsoft Visual Studio\Installer\vswhere.exe" (
+      set VSWHERE="true"
+      cd "%ProgramFiles%\Microsoft Visual Studio\Installer"
+) else (
+   echo "vswhere tool is not found"
+)
+
+set MSBUILD_BIN=
+set VS_PATH=
+
+if !VSWHERE! == "true" (
+   for /f "usebackq tokens=*" %%i in (`vswhere -latest -products * -requires Microsoft.Component.MSBuild -property installationPath`) do (
+      set VS_PATH=%%i
+   )
+   if exist "!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe"
+   )
+)
+
+if "!MSBUILD_BIN!" == "" (
+   if exist "C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=14 2015"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+) else (
+   if not "!MSBUILD_BIN:2019=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=16 2019"
+   if not "!MSBUILD_BIN:2017=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=15 2017"
+   if not "!MSBUILD_BIN:2015=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=14 2015"
+)
+
+if "!MSBUILD_BIN!" == "" (
+   echo Build tools for Visual Studio 2015 / 2017 / 2019 cannot be found. If you use Visual Studio 2017, please download and install build tools from https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2017
+   GOTO errorHandling
+)
+
+set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_samples_build"
+
+echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^
+if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt"
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio !MSBUILD_VERSION!" -A %PLATFORM% "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+echo.
+echo ###############^|^| Build Inference Engine samples using MS Visual Studio (MSBuild.exe) ^|^|###############
+echo.
+timeout 3
+echo "!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:classification_sample_async /clp:ErrorsOnly /m
+"!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:classification_sample_async /clp:ErrorsOnly /m
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:runSample
+echo.
+echo ###############^|^| Run Inference Engine classification sample ^|^|###############
+echo.
+timeout 3
+copy /Y "%ROOT_DIR%%model_name%.labels" "%ir_dir%"
+cd "%SOLUTION_DIR64%\intel64\Release"
+
+echo classification_sample_async.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -d !TARGET! !SAMPLE_OPTIONS!
+classification_sample_async.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -d !TARGET! !SAMPLE_OPTIONS!
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+echo.
+echo ###############^|^| Classification demo completed successfully ^|^|###############
+
+timeout 10
+cd "%ROOT_DIR%"
+
+goto :eof
+
+:errorHandling
+echo Error
+cd "%ROOT_DIR%"
+exit /b 1
--- /dev/null
+#!/usr/bin/env bash
+
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Classification demo: downloads the public SqueezeNet model, converts it to
+# IR with Model Optimizer, builds the Inference Engine samples and runs the
+# classification sample on car.png.
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+. "$ROOT_DIR/utils.sh"
+
+usage() {
+    echo "Classification demo using public SqueezeNet topology"
+    echo "-d name     specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified"
+    echo "-help       print help message"
+    exit 1
+}
+
+trap 'error ${LINENO}' ERR
+
+target="CPU"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+    -h | -help | --help)
+    usage
+    ;;
+    -d)
+    target="$2"
+    echo target = "${target}"
+    shift
+    ;;
+    -sample-options)
+    sampleoptions="$2 $3 $4 $5 $6"
+    echo sample-options = "${sampleoptions}"
+    shift
+    ;;
+    *)
+    # unknown option
+    ;;
+esac
+shift
+done
+
+target_precision="FP16"
+
+printf "target_precision = ${target_precision}\n"
+
+models_path="$HOME/openvino_models/models"
+models_cache="$HOME/openvino_models/cache"
+irs_path="$HOME/openvino_models/ir"
+
+model_name="squeezenet1.1"
+
+target_image_path="$ROOT_DIR/car.png"
+
+run_again="Then run the script again\n\n"
+dashes="\n\n###################################################\n\n"
+
+
+if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then
+    setupvars_path="$ROOT_DIR/../../bin/setupvars.sh"
+else
+    printf "Error: setupvars.sh is not found\n"
+    # Without setupvars.sh the environment cannot be configured; stop here
+    # instead of sourcing an unset path below.
+    exit 1
+fi
+
+if ! . "$setupvars_path" ; then
+    printf "Unable to run ./setupvars.sh. Please check its presence. ${run_again}"
+    exit 1
+fi
+
+# Step 1. Download the Caffe model and the prototxt of the model
+printf "${dashes}"
+printf "\n\nDownloading the Caffe model and the prototxt"
+
+cur_path=$PWD
+
+printf "\nInstalling dependencies\n"
+
+if [[ -f /etc/centos-release ]]; then
+    DISTRO="centos"
+elif [[ -f /etc/lsb-release ]]; then
+    DISTRO="ubuntu"
+fi
+
+if [[ $DISTRO == "centos" ]]; then
+    sudo -E yum install -y centos-release-scl epel-release
+    sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \
+                           glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake
+
+    sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true
+    sudo -E yum install -y epel-release
+    sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel
+
+    # check installed Python version
+    if command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    fi
+    if command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+    if [ -z "$python_binary" ]; then
+        sudo -E yum install -y rh-python36 || true
+        . scl_source enable rh-python36
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+elif [[ $DISTRO == "ubuntu" ]]; then
+    sudo -E apt update
+    print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base
+    python_binary=python3
+    pip_binary=pip3
+
+    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
+    if [ "$system_ver" = "18.04" ]; then
+        sudo -E apt-get install -y libpng-dev
+    else
+        sudo -E apt-get install -y libpng12-dev
+    fi
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    # check installed Python version
+    if command -v python3.7 >/dev/null 2>&1; then
+        python_binary=python3.7
+        pip_binary=pip3.7
+    elif command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    elif command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    else
+        python_binary=python3
+        pip_binary=pip3
+    fi
+fi
+
+if ! command -v "$python_binary" &>/dev/null; then
+    printf "\n\nPython 3.5 (x64) or higher is not installed. It is required to run Model Optimizer, please install it. ${run_again}"
+    exit 1
+fi
+
+# macOS has no sudo-less system pip convention here; elsewhere install system-wide
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    $pip_binary install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in"
+else
+    sudo -E $pip_binary install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in"
+fi
+
+downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader"
+
+model_dir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" |
+    "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])')
+
+downloader_path="$downloader_dir/downloader.py"
+
+print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}"
+
+ir_dir="${irs_path}/${model_dir}/${target_precision}"
+
+if [ ! -e "$ir_dir" ]; then
+    # Step 2. Configure Model Optimizer
+    printf "${dashes}"
+    printf "Install Model Optimizer dependencies\n\n"
+    cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/install_prerequisites"
+    . ./install_prerequisites.sh caffe
+    cd "$cur_path"
+
+    # Step 3. Convert a model with Model Optimizer
+    printf "${dashes}"
+    printf "Convert a model with Model Optimizer\n\n"
+
+    mo_path="${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/mo.py"
+
+    export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+    print_and_run "$python_binary" "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision"
+else
+    printf "\n\nTarget folder ${ir_dir} already exists. Skipping IR generation with Model Optimizer."
+    printf "If you want to convert a model again, remove the entire ${ir_dir} folder. ${run_again}"
+fi
+
+# Step 4. Build samples
+printf "${dashes}"
+printf "Build Inference Engine samples\n\n"
+
+OS_PATH=$(uname -m)
+NUM_THREADS="-j2"
+
+if [ "$OS_PATH" == "x86_64" ]; then
+    OS_PATH="intel64"
+    NUM_THREADS="-j8"
+fi
+
+samples_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/samples/cpp"
+build_dir="$HOME/inference_engine_samples_build"
+binaries_dir="${build_dir}/${OS_PATH}/Release"
+
+# Drop a stale CMake cache so reconfiguration picks up the current environment
+if [ -e "$build_dir/CMakeCache.txt" ]; then
+    rm -rf "$build_dir/CMakeCache.txt"
+fi
+mkdir -p "$build_dir"
+cd "$build_dir"
+cmake -DCMAKE_BUILD_TYPE=Release "$samples_path"
+
+make $NUM_THREADS classification_sample_async
+
+# Step 5. Run samples
+printf "${dashes}"
+printf "Run Inference Engine classification sample\n\n"
+
+cd "$binaries_dir"
+
+cp -f "$ROOT_DIR/${model_name}.labels" "${ir_dir}/"
+
+print_and_run ./classification_sample_async -d "$target" -i "$target_image_path" -m "${ir_dir}/${model_name}.xml" ${sampleoptions}
+
+printf "${dashes}"
+
+printf "Demo completed successfully.\n\n"
--- /dev/null
+tench, Tinca tinca
+goldfish, Carassius auratus
+great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+tiger shark, Galeocerdo cuvieri
+hammerhead, hammerhead shark
+electric ray, crampfish, numbfish, torpedo
+stingray
+cock
+hen
+ostrich, Struthio camelus
+brambling, Fringilla montifringilla
+goldfinch, Carduelis carduelis
+house finch, linnet, Carpodacus mexicanus
+junco, snowbird
+indigo bunting, indigo finch, indigo bird, Passerina cyanea
+robin, American robin, Turdus migratorius
+bulbul
+jay
+magpie
+chickadee
+water ouzel, dipper
+kite
+bald eagle, American eagle, Haliaeetus leucocephalus
+vulture
+great grey owl, great gray owl, Strix nebulosa
+European fire salamander, Salamandra salamandra
+common newt, Triturus vulgaris
+eft
+spotted salamander, Ambystoma maculatum
+axolotl, mud puppy, Ambystoma mexicanum
+bullfrog, Rana catesbeiana
+tree frog, tree-frog
+tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+loggerhead, loggerhead turtle, Caretta caretta
+leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+mud turtle
+terrapin
+box turtle, box tortoise
+banded gecko
+common iguana, iguana, Iguana iguana
+American chameleon, anole, Anolis carolinensis
+whiptail, whiptail lizard
+agama
+frilled lizard, Chlamydosaurus kingi
+alligator lizard
+Gila monster, Heloderma suspectum
+green lizard, Lacerta viridis
+African chameleon, Chamaeleo chamaeleon
+Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+African crocodile, Nile crocodile, Crocodylus niloticus
+American alligator, Alligator mississipiensis
+triceratops
+thunder snake, worm snake, Carphophis amoenus
+ringneck snake, ring-necked snake, ring snake
+hognose snake, puff adder, sand viper
+green snake, grass snake
+king snake, kingsnake
+garter snake, grass snake
+water snake
+vine snake
+night snake, Hypsiglena torquata
+boa constrictor, Constrictor constrictor
+rock python, rock snake, Python sebae
+Indian cobra, Naja naja
+green mamba
+sea snake
+horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+diamondback, diamondback rattlesnake, Crotalus adamanteus
+sidewinder, horned rattlesnake, Crotalus cerastes
+trilobite
+harvestman, daddy longlegs, Phalangium opilio
+scorpion
+black and gold garden spider, Argiope aurantia
+barn spider, Araneus cavaticus
+garden spider, Aranea diademata
+black widow, Latrodectus mactans
+tarantula
+wolf spider, hunting spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse, partridge, Bonasa umbellus
+prairie chicken, prairie grouse, prairie fowl
+peacock
+quail
+partridge
+African grey, African gray, Psittacus erithacus
+macaw
+sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser, Mergus serrator
+goose
+black swan, Cygnus atratus
+tusker
+echidna, spiny anteater, anteater
+platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+wallaby, brush kangaroo
+koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+wombat
+jellyfish
+sea anemone, anemone
+brain coral
+flatworm, platyhelminth
+nematode, nematode worm, roundworm
+conch
+snail
+slug
+sea slug, nudibranch
+chiton, coat-of-mail shell, sea cradle, polyplacophore
+chambered nautilus, pearly nautilus, nautilus
+Dungeness crab, Cancer magister
+rock crab, Cancer irroratus
+fiddler crab
+king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+American lobster, Northern lobster, Maine lobster, Homarus americanus
+spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+crayfish, crawfish, crawdad, crawdaddy
+hermit crab
+isopod
+white stork, Ciconia ciconia
+black stork, Ciconia nigra
+spoonbill
+flamingo
+little blue heron, Egretta caerulea
+American egret, great white heron, Egretta albus
+bittern
+crane
+limpkin, Aramus pictus
+European gallinule, Porphyrio porphyrio
+American coot, marsh hen, mud hen, water hen, Fulica americana
+bustard
+ruddy turnstone, Arenaria interpres
+red-backed sandpiper, dunlin, Erolia alpina
+redshank, Tringa totanus
+dowitcher
+oystercatcher, oyster catcher
+pelican
+king penguin, Aptenodytes patagonica
+albatross, mollymawk
+grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+dugong, Dugong dugon
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog, Maltese terrier, Maltese
+Pekinese, Pekingese, Peke
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound, Afghan
+basset, basset hound
+beagle
+bloodhound, sleuthhound
+bluetick
+black-and-tan coonhound
+Walker hound, Walker foxhound
+English foxhound
+redbone
+borzoi, Russian wolfhound
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound, Ibizan Podenco
+Norwegian elkhound, elkhound
+otterhound, otter hound
+Saluki, gazelle hound
+Scottish deerhound, deerhound
+Weimaraner
+Staffordshire bullterrier, Staffordshire bull terrier
+American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier, Sealyham
+Airedale, Airedale terrier
+cairn, cairn terrier
+Australian terrier
+Dandie Dinmont, Dandie Dinmont terrier
+Boston bull, Boston terrier
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier, Scottish terrier, Scottie
+Tibetan terrier, chrysanthemum dog
+silky terrier, Sydney silky
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa, Lhasa apso
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla, Hungarian pointer
+English setter
+Irish setter, red setter
+Gordon setter
+Brittany spaniel
+clumber, clumber spaniel
+English springer, English springer spaniel
+Welsh springer spaniel
+cocker spaniel, English cocker spaniel, cocker
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog, bobtail
+Shetland sheepdog, Shetland sheep dog, Shetland
+collie
+Border collie
+Bouvier des Flandres, Bouviers des Flandres
+Rottweiler
+German shepherd, German shepherd dog, German police dog, alsatian
+Doberman, Doberman pinscher
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard, St Bernard
+Eskimo dog, husky
+malamute, malemute, Alaskan malamute
+Siberian husky
+dalmatian, coach dog, carriage dog
+affenpinscher, monkey pinscher, monkey dog
+basenji
+pug, pug-dog
+Leonberg
+Newfoundland, Newfoundland dog
+Great Pyrenees
+Samoyed, Samoyede
+Pomeranian
+chow, chow chow
+keeshond
+Brabancon griffon
+Pembroke, Pembroke Welsh corgi
+Cardigan, Cardigan Welsh corgi
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf, grey wolf, gray wolf, Canis lupus
+white wolf, Arctic wolf, Canis lupus tundrarum
+red wolf, maned wolf, Canis rufus, Canis niger
+coyote, prairie wolf, brush wolf, Canis latrans
+dingo, warrigal, warragal, Canis dingo
+dhole, Cuon alpinus
+African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+hyena, hyaena
+red fox, Vulpes vulpes
+kit fox, Vulpes macrotis
+Arctic fox, white fox, Alopex lagopus
+grey fox, gray fox, Urocyon cinereoargenteus
+tabby, tabby cat
+tiger cat
+Persian cat
+Siamese cat, Siamese
+Egyptian cat
+cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+lynx, catamount
+leopard, Panthera pardus
+snow leopard, ounce, Panthera uncia
+jaguar, panther, Panthera onca, Felis onca
+lion, king of beasts, Panthera leo
+tiger, Panthera tigris
+cheetah, chetah, Acinonyx jubatus
+brown bear, bruin, Ursus arctos
+American black bear, black bear, Ursus americanus, Euarctos americanus
+ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+sloth bear, Melursus ursinus, Ursus ursinus
+mongoose
+meerkat, mierkat
+tiger beetle
+ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+ground beetle, carabid beetle
+long-horned beetle, longicorn, longicorn beetle
+leaf beetle, chrysomelid
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant, emmet, pismire
+grasshopper, hopper
+cricket
+walking stick, walkingstick, stick insect
+cockroach, roach
+mantis, mantid
+cicada, cicala
+leafhopper
+lacewing, lacewing fly
+dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+damselfly
+admiral
+ringlet, ringlet butterfly
+monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+cabbage butterfly
+sulphur butterfly, sulfur butterfly
+lycaenid, lycaenid butterfly
+starfish, sea star
+sea urchin
+sea cucumber, holothurian
+wood rabbit, cottontail, cottontail rabbit
+hare
+Angora, Angora rabbit
+hamster
+porcupine, hedgehog
+fox squirrel, eastern fox squirrel, Sciurus niger
+marmot
+beaver
+guinea pig, Cavia cobaya
+sorrel
+zebra
+hog, pig, grunter, squealer, Sus scrofa
+wild boar, boar, Sus scrofa
+warthog
+hippopotamus, hippo, river horse, Hippopotamus amphibius
+ox
+water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+bison
+ram, tup
+bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+ibex, Capra ibex
+hartebeest
+impala, Aepyceros melampus
+gazelle
+Arabian camel, dromedary, Camelus dromedarius
+llama
+weasel
+mink
+polecat, fitch, foulmart, foumart, Mustela putorius
+black-footed ferret, ferret, Mustela nigripes
+otter
+skunk, polecat, wood pussy
+badger
+armadillo
+three-toed sloth, ai, Bradypus tridactylus
+orangutan, orang, orangutang, Pongo pygmaeus
+gorilla, Gorilla gorilla
+chimpanzee, chimp, Pan troglodytes
+gibbon, Hylobates lar
+siamang, Hylobates syndactylus, Symphalangus syndactylus
+guenon, guenon monkey
+patas, hussar monkey, Erythrocebus patas
+baboon
+macaque
+langur
+colobus, colobus monkey
+proboscis monkey, Nasalis larvatus
+marmoset
+capuchin, ringtail, Cebus capucinus
+howler monkey, howler
+titi, titi monkey
+spider monkey, Ateles geoffroyi
+squirrel monkey, Saimiri sciureus
+Madagascar cat, ring-tailed lemur, Lemur catta
+indri, indris, Indri indri, Indri brevicaudatus
+Indian elephant, Elephas maximus
+African elephant, Loxodonta africana
+lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+barracouta, snoek
+eel
+coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+rock beauty, Holocanthus tricolor
+anemone fish
+sturgeon
+gar, garfish, garpike, billfish, Lepisosteus osseus
+lionfish
+puffer, pufferfish, blowfish, globefish
+abacus
+abaya
+academic gown, academic robe, judge's robe
+accordion, piano accordion, squeeze box
+acoustic guitar
+aircraft carrier, carrier, flattop, attack aircraft carrier
+airliner
+airship, dirigible
+altar
+ambulance
+amphibian, amphibious vehicle
+analog clock
+apiary, bee house
+apron
+ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+assault rifle, assault gun
+backpack, back pack, knapsack, packsack, rucksack, haversack
+bakery, bakeshop, bakehouse
+balance beam, beam
+balloon
+ballpoint, ballpoint pen, ballpen, Biro
+Band Aid
+banjo
+bannister, banister, balustrade, balusters, handrail
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel, cask
+barrow, garden cart, lawn cart, wheelbarrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap, swimming cap
+bath towel
+bathtub, bathing tub, bath, tub
+beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+beacon, lighthouse, beacon light, pharos
+beaker
+bearskin, busby, shako
+beer bottle
+beer glass
+bell cote, bell cot
+bib
+bicycle-built-for-two, tandem bicycle, tandem
+bikini, two-piece
+binder, ring-binder
+binoculars, field glasses, opera glasses
+birdhouse
+boathouse
+bobsled, bobsleigh, bob
+bolo tie, bolo, bola tie, bola
+bonnet, poke bonnet
+bookcase
+bookshop, bookstore, bookstall
+bottlecap
+bow
+bow tie, bow-tie, bowtie
+brass, memorial tablet, plaque
+brassiere, bra, bandeau
+breakwater, groin, groyne, mole, bulwark, seawall, jetty
+breastplate, aegis, egis
+broom
+bucket, pail
+buckle
+bulletproof vest
+bullet train, bullet
+butcher shop, meat market
+cab, hack, taxi, taxicab
+caldron, cauldron
+candle, taper, wax light
+cannon
+canoe
+can opener, tin opener
+cardigan
+car mirror
+carousel, carrousel, merry-go-round, roundabout, whirligig
+carpenter's kit, tool kit
+carton
+car wheel
+cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello, violoncello
+cellular telephone, cellular phone, cellphone, cell, mobile phone
+chain
+chainlink fence
+chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+chain saw, chainsaw
+chest
+chiffonier, commode
+chime, bell, gong
+china cabinet, china closet
+Christmas stocking
+church, church building
+cinema, movie theater, movie theatre, movie house, picture palace
+cleaver, meat cleaver, chopper
+cliff dwelling
+cloak
+clog, geta, patten, sabot
+cocktail shaker
+coffee mug
+coffeepot
+coil, spiral, volute, whorl, helix
+combination lock
+computer keyboard, keypad
+confectionery, confectionary, candy store
+container ship, containership, container vessel
+convertible
+corkscrew, bottle screw
+cornet, horn, trumpet, trump
+cowboy boot
+cowboy hat, ten-gallon hat
+cradle
+crane
+crash helmet
+crate
+crib, cot
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam, dike, dyke
+desk
+desktop computer
+dial telephone, dial phone
+diaper, nappy, napkin
+digital clock
+digital watch
+dining table, board
+dishrag, dishcloth
+dishwasher, dish washer, dishwashing machine
+disk brake, disc brake
+dock, dockage, docking facility
+dogsled, dog sled, dog sleigh
+dome
+doormat, welcome mat
+drilling platform, offshore rig
+drum, membranophone, tympan
+drumstick
+dumbbell
+Dutch oven
+electric fan, blower
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa, boa
+file, file cabinet, filing cabinet
+fireboat
+fire engine, fire truck
+fire screen, fireguard
+flagpole, flagstaff
+flute, transverse flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn, horn
+frying pan, frypan, skillet
+fur coat
+garbage truck, dustcart
+gasmask, respirator, gas helmet
+gas pump, gasoline pump, petrol pump, island dispenser
+goblet
+go-kart
+golf ball
+golfcart, golf cart
+gondola
+gong, tam-tam
+gown
+grand piano, grand
+greenhouse, nursery, glasshouse
+grille, radiator grille
+grocery store, grocery, food market, market
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower, blow dryer, blow drier, hair dryer, hair drier
+hand-held computer, hand-held microcomputer
+handkerchief, hankie, hanky, hankey
+hard disc, hard disk, fixed disk
+harmonica, mouth organ, harp, mouth harp
+harp
+harvester, reaper
+hatchet
+holster
+home theater, home theatre
+honeycomb
+hook, claw
+hoopskirt, crinoline
+horizontal bar, high bar
+horse cart, horse-cart
+hourglass
+iPod
+iron, smoothing iron
+jack-o'-lantern
+jean, blue jean, denim
+jeep, landrover
+jersey, T-shirt, tee shirt
+jigsaw puzzle
+jinrikisha, ricksha, rickshaw
+joystick
+kimono
+knee pad
+knot
+lab coat, laboratory coat
+ladle
+lampshade, lamp shade
+laptop, laptop computer
+lawn mower, mower
+lens cap, lens cover
+letter opener, paper knife, paperknife
+library
+lifeboat
+lighter, light, igniter, ignitor
+limousine, limo
+liner, ocean liner
+lipstick, lip rouge
+Loafer
+lotion
+loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+loupe, jeweler's loupe
+lumbermill, sawmill
+magnetic compass
+mailbag, postbag
+mailbox, letter box
+maillot
+maillot, tank suit
+manhole cover
+maraca
+marimba, xylophone
+mask
+matchstick
+maypole
+maze, labyrinth
+measuring cup
+medicine chest, medicine cabinet
+megalith, megalithic structure
+microphone, mike
+microwave, microwave oven
+military uniform
+milk can
+minibus
+miniskirt, mini
+minivan
+missile
+mitten
+mixing bowl
+mobile home, manufactured home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter, scooter
+mountain bike, all-terrain bike, off-roader
+mountain tent
+mouse, computer mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook, notebook computer
+obelisk
+oboe, hautboy, hautbois
+ocarina, sweet potato
+odometer, hodometer, mileometer, milometer
+oil filter
+organ, pipe organ
+oscilloscope, scope, cathode-ray oscilloscope, CRO
+overskirt
+oxcart
+oxygen mask
+packet
+paddle, boat paddle
+paddlewheel, paddle wheel
+padlock
+paintbrush
+pajama, pyjama, pj's, jammies
+palace
+panpipe, pandean pipe, syrinx
+paper towel
+parachute, chute
+parallel bars, bars
+park bench
+parking meter
+passenger car, coach, carriage
+patio, terrace
+pay-phone, pay-station
+pedestal, plinth, footstall
+pencil box, pencil case
+pencil sharpener
+perfume, essence
+Petri dish
+photocopier
+pick, plectrum, plectron
+pickelhaube
+picket fence, paling
+pickup, pickup truck
+pier
+piggy bank, penny bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate, pirate ship
+pitcher, ewer
+plane, carpenter's plane, woodworking plane
+planetarium
+plastic bag
+plate rack
+plow, plough
+plunger, plumber's helper
+Polaroid camera, Polaroid Land camera
+pole
+police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+poncho
+pool table, billiard table, snooker table
+pop bottle, soda bottle
+pot, flowerpot
+potter's wheel
+power drill
+prayer rug, prayer mat
+printer
+prison, prison house
+projectile, missile
+projector
+puck, hockey puck
+punching bag, punch bag, punching ball, punchball
+purse
+quill, quill pen
+quilt, comforter, comfort, puff
+racer, race car, racing car
+racket, racquet
+radiator
+radio, wireless
+radio telescope, radio reflector
+rain barrel
+recreational vehicle, RV, R.V.
+reel
+reflex camera
+refrigerator, icebox
+remote control, remote
+restaurant, eating house, eating place, eatery
+revolver, six-gun, six-shooter
+rifle
+rocking chair, rocker
+rotisserie
+rubber eraser, rubber, pencil eraser
+rugby ball
+rule, ruler
+running shoe
+safe
+safety pin
+saltshaker, salt shaker
+sandal
+sarong
+sax, saxophone
+scabbard
+scale, weighing machine
+school bus
+schooner
+scoreboard
+screen, CRT screen
+screw
+screwdriver
+seat belt, seatbelt
+sewing machine
+shield, buckler
+shoe shop, shoe-shop, shoe store
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule, slipstick
+sliding door
+slot, one-armed bandit
+snorkel
+snowmobile
+snowplow, snowplough
+soap dispenser
+soccer ball
+sock
+solar dish, solar collector, solar furnace
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web, spider's web
+spindle
+sports car, sport car
+spotlight, spot
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch, stop watch
+stove
+strainer
+streetcar, tram, tramcar, trolley, trolley car
+stretcher
+studio couch, day bed
+stupa, tope
+submarine, pigboat, sub, U-boat
+suit, suit of clothes
+sundial
+sunglass
+sunglasses, dark glasses, shades
+sunscreen, sunblock, sun blocker
+suspension bridge
+swab, swob, mop
+sweatshirt
+swimming trunks, bathing trunks
+swing
+switch, electric switch, electrical switch
+syringe
+table lamp
+tank, army tank, armored combat vehicle, armoured combat vehicle
+tape player
+teapot
+teddy, teddy bear
+television, television system
+tennis ball
+thatch, thatched roof
+theater curtain, theatre curtain
+thimble
+thresher, thrasher, threshing machine
+throne
+tile roof
+toaster
+tobacco shop, tobacconist shop, tobacconist
+toilet seat
+torch
+totem pole
+tow truck, tow car, wrecker
+toyshop
+tractor
+trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+tray
+trench coat
+tricycle, trike, velocipede
+trimaran
+tripod
+triumphal arch
+trolleybus, trolley coach, trackless trolley
+trombone
+tub, vat
+turnstile
+typewriter keyboard
+umbrella
+unicycle, monocycle
+upright, upright piano
+vacuum, vacuum cleaner
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin, fiddle
+volleyball
+waffle iron
+wall clock
+wallet, billfold, notecase, pocketbook
+wardrobe, closet, press
+warplane, military plane
+washbasin, handbasin, washbowl, lavabo, wash-hand basin
+washer, automatic washer, washing machine
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool, woolen, woollen
+worm fence, snake fence, snake-rail fence, Virginia fence
+wreck
+yawl
+yurt
+web site, website, internet site, site
+comic book
+crossword puzzle, crossword
+street sign
+traffic light, traffic signal, stoplight
+book jacket, dust cover, dust jacket, dust wrapper
+menu
+plate
+guacamole
+consomme
+hot pot, hotpot
+trifle
+ice cream, icecream
+ice lolly, lolly, lollipop, popsicle
+French loaf
+bagel, beigel
+pretzel
+cheeseburger
+hotdog, hot dog, red hot
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini, courgette
+spaghetti squash
+acorn squash
+butternut squash
+cucumber, cuke
+artichoke, globe artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple, ananas
+banana
+jackfruit, jak, jack
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce, chocolate syrup
+dough
+meat loaf, meatloaf
+pizza, pizza pie
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff, drop, drop-off
+coral reef
+geyser
+lakeside, lakeshore
+promontory, headland, head, foreland
+sandbar, sand bar
+seashore, coast, seacoast, sea-coast
+valley, vale
+volcano
+ballplayer, baseball player
+groom, bridegroom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+corn
+acorn
+hip, rose hip, rosehip
+buckeye, horse chestnut, conker
+coral fungus
+agaric
+gyromitra
+stinkhorn, carrion fungus
+earthstar
+hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+bolete
+ear, spike, capitulum
+toilet tissue, toilet paper, bathroom tissue
--- /dev/null
+error() {
+ local code="${3:-1}"
+ if [[ -n "$2" ]]; then
+ echo "Error on or near line $1: $2; exiting with status ${code}"
+ else
+ echo "Error on or near line $1; exiting with status ${code}"
+ fi
+ exit "${code}"
+}
+
+print_and_run() {
+ printf 'Run'
+ printf ' %q' "$@"
+ printf '\n\n'
+ "$@"
+}
--- /dev/null
+#!/bin/bash -x
+
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script installs Linux kernel 4.14 required for Intel NEO OpenCL driver on Ubuntu and CentOS
+
+if grep -i "rhel" /etc/os-release &>/dev/null; then
+    # CentOS
+ echo "install kernel build dependencies"
+ sudo -E yum install -y git gcc gcc-c++ ncurses-devel openssl-devel bc xz elfutils-libelf-devel xorg-x11-drv-nouveau rpm-build
+
+ echo "download 4.14.20 kernel"
+ if [ ! -f ./linux-4.14.20.tar.xz ]; then
+ wget https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.14.20.tar.xz
+ fi
+
+ tar -xJf linux-4.14.20.tar.xz
+ cd linux-4.14.20
+ echo "build 4.14.20 kernel"
+ make olddefconfig
+
+ make -j 8 binrpm-pkg
+ cd ~/rpmbuild/RPMS/x86_64
+ sudo -E yum -y localinstall *.rpm
+ sudo -E grub2-set-default 0
+
+elif grep -i "ubuntu" /etc/os-release &>/dev/null; then
+ # Ubuntu
+ sudo -E add-apt-repository ppa:teejee2008/ppa
+ sudo -E apt-get update && sudo apt-get install -y ukuu
+ sudo -E ukuu --install v4.14.20
+fi
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+if [ $EUID -ne 0 ]; then
+ echo "ERROR: this script must be run as root to install 3rd party packages." >&2
+ echo "Please try again with \"sudo -E $0\", or as root." >&2
+ exit 1
+fi
+
+params=$@
+
+yes_or_no() {
+ if [ "$params" == "-y" ]; then
+ return 0
+ fi
+
+ while true; do
+ read -p "Add third-party repositories and install GStreamer Plugins (y/n): " yn
+ case $yn in
+ [Yy]*) return 0 ;;
+ [Nn]*) return 1 ;;
+ esac
+ done
+}
+
+echo
+echo "This script installs the following GStreamer 3rd-party dependencies:"
+echo " 1. build dependencies for GStreamer plugin bad"
+echo " 2. build dependencies for GStreamer plugin ugly"
+echo " 3. build dependencies for GStreamer plugin vaapi"
+echo
+
+if [ -f /etc/lsb-release ]; then
+ # Ubuntu
+ PKGS=(
+ libbluetooth-dev
+ libusb-1.0.0-dev
+ libass-dev
+ libbs2b-dev
+ libchromaprint-dev
+ liblcms2-dev
+ libssh2-1-dev
+ libdc1394-22-dev
+ libdirectfb-dev
+ libssh-dev
+ libdca-dev
+ libfaac-dev
+ libfaad-dev
+ libfdk-aac-dev
+ flite1-dev
+ libfluidsynth-dev
+ libgme-dev
+ libgsm1-dev
+ nettle-dev
+ libkate-dev
+ liblrdf0-dev
+ libde265-dev
+ libmjpegtools-dev
+ libmms-dev
+ libmodplug-dev
+ libmpcdec-dev
+ libneon27-dev
+ libofa0-dev
+ libopenal-dev
+ libopenexr-dev
+ libopenjp2-7-dev
+ libopenmpt-dev
+ libopenni2-dev
+ libdvdnav-dev
+ librtmp-dev
+ librsvg2-dev
+ libsbc-dev
+ libsndfile1-dev
+ libsoundtouch-dev
+ libspandsp-dev
+ libsrtp2-dev
+ libzvbi-dev
+ libvo-aacenc-dev
+ libvo-amrwbenc-dev
+ libwebrtc-audio-processing-dev
+ libwebp-dev
+ libwildmidi-dev
+ libzbar-dev
+ libnice-dev
+ libx265-dev
+ libxkbcommon-dev
+ libx264-dev
+ libmpeg2-4-dev
+ libdvdread-dev
+ libcdio-dev
+ libopencore-amrnb-dev
+ libopencore-amrwb-dev
+ liba52-0.7.4-dev
+ libsidplay1-dev
+ libva-dev
+ libxrandr-dev
+ libudev-dev
+        python-gi-dev
+ python3-dev
+ )
+ apt update
+ apt install -y ${PKGS[@]}
+else
+ # CentOS
+ PKGS=(
+ bluez-libs-devel
+ libusb-devel
+ libass-devel
+ libbs2b-devel
+ libchromaprint-devel
+ lcms2-devel
+ libssh2-devel
+ libdc1394-devel
+ libXext-devel
+ libssh-devel
+ libdca-devel
+ faac-devel
+ faad2-devel
+ fdk-aac-devel
+ flite-devel
+ fluidsynth-devel
+ game-music-emu-devel
+ gsm-devel
+ nettle-devel
+ kate-devel
+ liblrdf-devel
+ libde265-devel
+ mjpegtools-devel
+ libmms-devel
+ libmodplug-devel
+ libmpcdec-devel
+ neon-devel
+ libofa-devel
+ openal-soft-devel
+ OpenEXR-devel
+ openjpeg2-devel
+ openni-devel
+ libdvdnav-devel
+ librtmp-devel
+ librsvg2-devel
+ sbc-devel
+ libsndfile-devel
+ soundtouch-devel
+ spandsp-devel
+ libsrtp-devel
+ zvbi-devel
+ vo-amrwbenc-devel
+ webrtc-audio-processing-devel
+ wildmidi-devel
+ zbar-devel
+ libnice-devel
+ x265-devel
+ libxkbcommon-devel
+ x264-devel
+ libmpeg2-devel
+ libcdio-devel
+ opencore-amr-devel
+ libva-devel
+ python36-gobject-devel
+ python3-devel
+ )
+ if yes_or_no; then
+ rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
+ yum install -y epel-release
+ rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
+ yum install -y ${PKGS[@]}
+ else
+ echo
+ echo "Plugins cannot be installed without adding repositories:"
+        echo "  RPM-GPG-KEY-nux, epel-release, nux-dextop-release-0-5."
+ echo
+ fi
+ exit
+fi
--- /dev/null
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Updating udev rules..."
+
+if [ -z "$INTEL_OPENVINO_DIR" ]; then
+ echo "Please set up your environment. Run 'source <OPENVINO_INSTALLDIR>/bin/setupvars.sh'."
+ exit -1
+fi
+
+if [ -f "$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/external/97-myriad-usbboot.rules" ]; then
+ sudo usermod -a -G users "$(whoami)"
+
+ sudo cp "$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/external/97-myriad-usbboot.rules" /etc/udev/rules.d/
+ sudo udevadm control --reload-rules
+ sudo udevadm trigger
+ sudo ldconfig
+ echo "Udev rules have been successfully installed."
+else
+ echo "File '97-myriad-usbboot.rules' is missing. Please make sure you installed 'Inference Engine Runtime for Intel® Movidius™ VPU'."
+ exit -1
+fi
+
+
--- /dev/null
+#!/bin/bash
+
+# Copyright (c) 2018 - 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# Installs the Graphics Driver for OpenCL on Linux.
+#
+# Usage: sudo -E ./install_NEO_OCL_driver.sh
+#
+# Supported platforms:
+# 6th, 7th, 8th or 9th generation Intel® processor with Intel(R)
+# Processor Graphics Technology not previously disabled by the BIOS
+# or motherboard settings
+#
+EXIT_FAILURE=1
+UBUNTU_VERSION=
+DISTRO=
+
+
+params=$@
+yes_or_no() {
+ if [ "$params" == "-y" ]; then
+ return 1
+ fi
+
+ while true; do
+ read -p "Do you want to continue: " yn
+ case $yn in
+ [Yy]*) return 1 ;;
+ [Nn]*) return 0 ;;
+ esac
+ done
+}
+
+
+_install_prerequisites_centos()
+{
+ # yum doesn't accept timeout in seconds as parameter
+ echo
+ echo "Note: if yum becomes non-responsive, try aborting the script and run:"
+ echo " sudo -E $0"
+ echo
+
+ CMDS=("yum -y install tar libpciaccess numactl-libs"
+ "yum -y groupinstall 'Development Tools'"
+ "yum -y install rpmdevtools openssl openssl-devel bc numactl ocl-icd ocl-icd-devel")
+
+ for cmd in "${CMDS[@]}"; do
+ echo $cmd
+ eval $cmd
+ if [[ $? -ne 0 ]]; then
+ echo ERROR: failed to run $cmd >&2
+ echo Problem \(or disk space\)? >&2
+ echo . Verify that you have enough disk space, and run the script again. >&2
+ exit $EXIT_FAILURE
+ fi
+ done
+
+}
+
+_install_prerequisites_ubuntu()
+{
+ CMDS=("apt-get -y update"
+ "apt-get -y install libnuma1 ocl-icd-libopencl1")
+
+ for cmd in "${CMDS[@]}"; do
+ echo $cmd
+ eval $cmd
+ if [[ $? -ne 0 ]]; then
+ echo ERROR: failed to run $cmd >&2
+ echo Problem \(or disk space\)? >&2
+ echo " sudo -E $0" >&2
+ echo 2. Verify that you have enough disk space, and run the script again. >&2
+ exit $EXIT_FAILURE
+ fi
+ done
+}
+
+install_prerequisites()
+{
+ if [[ $DISTRO == "centos" ]]; then
+ echo Installing prerequisites...
+ _install_prerequisites_centos
+ elif [[ $DISTRO == "ubuntu" ]]; then
+ echo Installing prerequisites...
+ _install_prerequisites_ubuntu
+ else
+ echo Unknown OS
+ fi
+}
+
+_deploy_rpm()
+{
+ # On a CentOS 7.2 machine with Intel Parallel Composer XE 2017
+ # installed we got conflicts when trying to deploy these rpms.
+ # If that happens to you too, try again with:
+ # IGFX_RPM_FLAGS="--force" sudo -E ./install_NEO_OCL_driver.sh install
+ #
+ cmd="rpm $IGFX_RPM_FLAGS -ivh --nodeps --force $1"
+ echo $cmd
+ eval $cmd
+}
+
+_deploy_deb()
+{
+ cmd="dpkg -i $1"
+ echo $cmd
+ eval $cmd
+}
+
+_install_user_mode_centos()
+{
+ _deploy_rpm "intel*.rpm"
+ if [[ $? -ne 0 ]]; then
+ echo ERROR: failed to install rpms $cmd error >&2
+ echo Make sure you have enough disk space or fix the problem manually and try again. >&2
+ exit $EXIT_FAILURE
+ fi
+}
+
+_install_user_mode_ubuntu()
+{
+ _deploy_deb "intel*.deb"
+ if [[ $? -ne 0 ]]; then
+ echo ERROR: failed to install rpms $cmd error >&2
+ echo Make sure you have enough disk space or fix the problem manually and try again. >&2
+ exit $EXIT_FAILURE
+ fi
+}
+
+install_user_mode()
+{
+ echo Installing user mode driver...
+
+ if [[ $DISTRO == "centos" ]]; then
+ _install_user_mode_centos
+ else
+ _install_user_mode_ubuntu
+ fi
+
+}
+
+_uninstall_user_mode_centos()
+{
+ echo Looking for previously installed user-mode driver...
+ PACKAGES=("intel-opencl"
+ "intel-ocloc"
+ "intel-gmmlib"
+ "intel-igc-core"
+ "intel-igc-opencl")
+ for package in "${PACKAGES[@]}"; do
+ echo "rpm -qa | grep $package"
+ found_package=$(rpm -qa | grep $package)
+ if [[ $? -eq 0 ]]; then
+ echo Found installed user-mode driver, performing uninstall...
+ cmd="rpm -e --nodeps ${found_package}"
+ echo $cmd
+ eval $cmd
+ if [[ $? -ne 0 ]]; then
+ echo ERROR: failed to uninstall existing user-mode driver. >&2
+ echo Please try again manually and run the script again. >&2
+ exit $EXIT_FAILURE
+ fi
+ fi
+ done
+}
+
+_uninstall_user_mode_ubuntu()
+{
+ echo Looking for previously installed user-mode driver...
+
+ PACKAGES=("intel-opencl"
+ "intel-ocloc"
+ "intel-gmmlib"
+ "intel-igc-core"
+ "intel-igc-opencl")
+
+ for package in "${PACKAGES[@]}"; do
+ found_package=$(dpkg-query -W -f='${binary:Package}\n' ${package})
+ if [[ $? -eq 0 ]]; then
+ echo Found $found_package installed, uninstalling...
+ dpkg --purge $found_package
+ if [[ $? -ne 0 ]]; then
+ echo "ERROR: unable to remove $found_package" >&2
+ echo " please resolve it manually and try to launch the script again." >&2
+ exit $EXIT_FAILURE
+ fi
+ fi
+ done
+}
+
+uninstall_user_mode()
+{
+ if [[ $DISTRO == "centos" ]]; then
+ _uninstall_user_mode_centos
+ else
+ _uninstall_user_mode_ubuntu
+ fi
+}
+
+version_gt() {
+ # check if first version is greater than second version
+ test "$(printf '%s\n' "$@" | sort -V | head -n 1)" != "$1";
+}
+
+summary()
+{
+ kernel_version=$(uname -r)
+
+ echo
+ echo Installation completed successfully.
+ echo
+ echo Next steps:
+ echo "Add OpenCL users to the video group: 'sudo usermod -a -G video USERNAME'"
+ echo " e.g. if the user running OpenCL host applications is foo, run: sudo usermod -a -G video foo"
+ echo " Current user has been already added to the video group"
+ echo
+
+ # ask to install kernel 4.14 if current kernel version < 4.13 (GPU NEO driver supports only kernels 4.13.x and higher)
+ if version_gt "4.13" "$kernel_version" ; then
+ echo "Install 4.14 kernel using install_4_14_kernel.sh script and reboot into this kernel"
+ echo
+ fi
+
+ echo "If you use 8th Generation Intel® Core™ processor, you will need to add:"
+ echo " i915.alpha_support=1"
+ echo " to the 4.14 kernel command line, in order to enable OpenCL functionality for this platform."
+ echo
+
+}
+
+check_root_access()
+{
+ if [[ $EUID -ne 0 ]]; then
+ echo "ERROR: you must run this script as root." >&2
+        echo "Please try again with \"sudo -E $0\", or as root." >&2
+ exit $EXIT_FAILURE
+ fi
+}
+
+add_user_to_video_group()
+{
+ local real_user=$(logname 2>/dev/null || echo ${SUDO_USER:-${USER}})
+ echo
+ echo Adding $real_user to the video group...
+ usermod -a -G video $real_user
+ if [[ $? -ne 0 ]]; then
+ echo WARNING: unable to add $real_user to the video group >&2
+ fi
+}
+
+_check_distro_version()
+{
+ if [[ $DISTRO == centos ]]; then
+ if ! grep -q 'CentOS Linux release 7\.' /etc/centos-release; then
+ echo ERROR: this script is supported only on CentOS 7 >&2
+ exit $EXIT_FAILURE
+ fi
+ elif [[ $DISTRO == ubuntu ]]; then
+ grep -q -E "18.04" /etc/lsb-release && UBUNTU_VERSION="18.04"
+ if [[ -z $UBUNTU_VERSION ]]; then
+            echo -e "Warning: The driver was validated only on Ubuntu 18.04 LTS with stock kernel. \nMore info https://github.com/intel/compute-runtime/releases" >&2
+            if yes_or_no; then
+ echo "Installation of GFX driver interrupted"
+ exit $EXIT_FAILURE
+ fi
+ fi
+ fi
+}
+
+distro_init()
+{
+ if [[ -f /etc/centos-release ]]; then
+ DISTRO="centos"
+ elif [[ -f /etc/lsb-release ]]; then
+ DISTRO="ubuntu"
+ fi
+
+ _check_distro_version
+}
+
+install()
+{
+ uninstall_user_mode
+ install_prerequisites
+ install_user_mode
+ add_user_to_video_group
+}
+
+main()
+{
+ echo "Intel OpenCL graphics driver installer"
+ distro_init
+ check_root_access
+ install
+ summary
+}
+
+[[ "$0" == "$BASH_SOURCE" ]] && main "$@"
--- /dev/null
+<meta http-equiv="REFRESH" content="0;URL=http://docs.openvinotoolkit.org/2019_R1/_docs_install_guides_installing_openvino_linux.html#set-the-environment-variables">
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+
+# Copyright (c) 2018 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+if [ $EUID -ne 0 ]; then
+ echo "ERROR: this script must be run as root to install 3rd party packages." >&2
+ echo "Please try again with \"sudo -E $0\", or as root." >&2
+ exit 1
+fi
+
+params=$@
+
+yes_or_no_ffmpeg() {
+ if [ "$params" == "-y" ]; then
+ return 0
+ fi
+
+ while true; do
+ read -p "Add third-party RPM Fusion repository and install FFmpeg package (y/n): " yn
+ case $yn in
+ [Yy]*) return 0 ;;
+ [Nn]*) return 1 ;;
+ esac
+ done
+}
+
+yes_or_no_gst_bad_ugly() {
+ if [ "$params" == "-y" ]; then
+ return 0
+ fi
+
+ while true; do
+ read -p "Add third-party RPM Epel, Nux, Fusion, Forensics repositories and install dependencies for GStreamer Bad & Ugly Plugins (y/n): " yn
+ case $yn in
+ [Yy]*) return 0 ;;
+ [Nn]*) return 1 ;;
+ esac
+ done
+}
+
+if [ -f /etc/lsb-release ]; then
+ # Ubuntu
+ echo
+ echo "This script installs the following OpenVINO 3rd-party dependencies:"
+ echo " 1. GTK+, FFmpeg and GStreamer libraries used by OpenCV"
+ echo " 2. libusb library required for Myriad plugin for Inference Engine"
+ echo " 3. build dependencies for OpenVINO samples"
+ echo " 4. build dependencies for GStreamer Plugins"
+ echo
+ PKGS=(
+ cpio
+ build-essential
+ cmake
+ libusb-1.0-0-dev
+ libdrm-dev
+ libgstreamer1.0-0
+ gstreamer1.0-plugins-base
+ gstreamer1.0-plugins-good
+ gstreamer1.0-plugins-bad
+ ffmpeg
+ )
+ system_ver=$(cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2)
+ if [ "$system_ver" = "16.04" ]; then
+ PKGS+=( libgtk2.0-0 )
+ else
+ PKGS+=( libgtk-3-0
+ libglib2.0-0
+ flex
+ bison
+ libgmp10
+ libgsl23
+ gobject-introspection
+ libcap2
+ libcap2-bin
+ gettext
+ libgirepository-1.0-1
+ libx11-6
+ iso-codes
+ libgl1-mesa-dri
+ libgles2
+ libgl-dev
+ gudev-1.0
+ libtheora0
+ libcdparanoia0
+ libpango-1.0-0
+ libgbm1
+ libasound2
+ libjpeg8
+ libvisual-0.4-0
+ libxv1
+ libopus0
+ libgraphene-1.0-0
+ libvorbis0a
+ libbz2-1.0
+ libv4l-0
+ libaa1
+ libflac8
+ libgdk-pixbuf2.0-0
+ libmp3lame0
+ libcaca0
+ libdv4
+ libmpg123-0
+ libraw1394-11
+ libavc1394-0
+ libiec61883-0
+ libpulse0
+ libsoup2.4-1
+ libspeex1
+ libtag-extras1
+ libtwolame0
+ libwavpack1
+ libbluetooth3
+ libusb-1.0.0-dev
+ libass9
+ libbs2b0
+ libchromaprint1
+ liblcms2-2
+ libssh2-1
+ libdc1394-22
+ libdirectfb-1.7-7
+ libssh-4
+ libdca0
+ libfaac0
+ libfdk-aac1
+ libflite1
+ libfluidsynth1
+ libgme0
+ libgsm1
+ libnettle6
+ libkate1
+ liblrdf0
+ libde265-0
+ libmjpegtools-dev
+ libmms0
+ libmodplug1
+ libmpcdec6
+ libneon27
+ libopenal1
+ libopenexr22
+ libopenjp2-7
+ libopenmpt0
+ libopenni2-0
+ libdvdnav4
+ librtmp1
+ librsvg2-2
+ libsbc1
+ libsndfile1
+ libsoundtouch1
+ libspandsp2
+ libsrtp2-1
+ libzvbi0
+ libvo-aacenc0
+ libvo-amrwbenc0
+ libwebrtc-audio-processing1
+ libwebp6
+ libwildmidi2
+ libzbar0
+ libnice10
+ libxkbcommon0
+ libmpeg2-4
+ libopencore-amrnb0
+ libopencore-amrwb0
+ liba52-0.7.4
+ libva2
+ libxrandr2
+ libudev1
+ python3.6
+ libpython3.6
+ python3-gi
+ )
+ fi
+ apt update
+ apt install -y ${PKGS[@]}
+else
+ # CentOS
+ echo
+ echo "This script installs the following OpenVINO 3rd-party dependencies:"
+ echo " 1. GTK+ and GStreamer libraries used by OpenCV"
+ echo " 2. libusb library required for Myriad plugin for Inference Engine"
+ echo " 3. Python 3.6 for Model Optimizer"
+ echo " 4. gcc 4.8.5 and other build dependencies for OpenVINO samples"
+ echo " 5. build dependencies for GStreamer Plugins"
+ echo
+ PKGS=(
+ libusbx-devel
+ gtk2
+ gstreamer1
+ gstreamer1-plugins-good
+ gstreamer1-plugins-bad-free
+ gcc
+ gcc-c++
+ make
+ glibc-static
+ glibc
+ libstdc++-static
+ libstdc++
+ libstdc++
+ libgcc
+ cmake
+ python36
+ python36-pip
+ glib2-devel
+ flex
+ bison
+ gmp
+ gsl
+ gobject-introspection
+ libcap
+ libcap
+ gettext
+ libXrandr
+ libX11
+ iso-codes
+ mesa-libEGL
+ mesa-libGLES
+ mesa-libGL
+ libgudev1
+ libtheora
+ cdparanoia
+ pango
+ mesa-libgbm
+ alsa-lib
+ libjpeg-turbo
+ libvisual
+ libXv
+ opus
+ libvorbis
+ patch
+ bzip2
+ libv4l
+ flac
+ gdk-pixbuf2
+ libdv
+ mpg123
+ libraw1394
+ libavc1394
+ libiec61883
+ pulseaudio-libs
+ libsoup
+ speex
+ wavpack
+ boost-regex-1.53.0
+ )
+ yum install -y ${PKGS[@]}
+
+ # Thirdparty repositories for installing GStreamer Bad & Ugly Plugins dependencies.
+ if yes_or_no_gst_bad_ugly; then
+ GST_BAD_UGLY_PKGS=(
+ bluez-libs
+ libusb
+ libass
+ libbs2b
+ libchromaprint
+ lcms2
+ libssh2
+ libdc1394
+ libXext
+ libssh
+ libdca
+ faac
+ fdk-aac
+ flite
+ fluidsynth
+ game-music-emu
+ gsm
+ nettle
+ kate
+ liblrdf
+ libde265
+ mjpegtools
+ libmms
+ libmodplug
+ libmpcdec
+ neon
+ openal-soft
+ OpenEXR
+ openjpeg2
+ openni
+ libdvdnav
+ librtmp
+ librsvg2
+ sbc
+ libsndfile
+ soundtouch
+ spandsp
+ libsrtp
+ zvbi
+ vo-amrwbenc
+ webrtc-audio-processing
+ wildmidi
+ zbar
+ libnice
+ libxkbcommon
+ libmpeg2
+ libcdio
+ opencore-amr
+ libva
+ python36-gobject
+ python3-devel
+ )
+ yum install -y epel-release
+ rpm -Uvh https://download1.rpmfusion.org/free/el/rpmfusion-free-release-7.noarch.rpm
+ RPMFUSION_IS_INSTALLED=1
+ yum install -y https://forensics.cert.org/cert-forensics-tools-release-el7.rpm
+ yum install -y ${GST_BAD_UGLY_PKGS[@]}
+ else
+ echo "Dependencies for GStreamer Ugly & Bad plugins installation skipped."
+ echo
+ fi
+
+ echo
+ echo "Intel(R) Distribution of OpenVINO(TM) toolkit can use FFmpeg for processing video streams with OpenCV. Please select your preferred method for installing FFmpeg:"
+ echo
+ echo "Option 1: Allow installer script to add a third party repository, RPM Fusion (https://rpmfusion.org/), which contains FFmpeg. FFmpeg rpm package will be installed from this repository. "
+ echo "WARNING: This repository is NOT PROVIDED OR SUPPORTED by Intel or CentOS. Neither Intel nor CentOS has control over this repository. Terms governing your use of FFmpeg can be found here: https://www.ffmpeg.org/legal.html "
+ echo "Once added, this repository will be enabled on your operating system and can thus receive updates to all packages installed from it. "
+ echo
+ echo "Consider the following ways to prevent unintended 'updates' from this third party repository from over-writing some core part of CentOS:"
+ echo "a) Only enable these archives from time to time, and generally leave them disabled. See: man yum"
+ echo "b) Use the exclude= and includepkgs= options on a per sub-archive basis, in the matching .conf file found in /etc/yum.repos.d/ See: man yum.conf"
+ echo "c) The yum Priorities plug-in can prevent a 3rd party repository from replacing base packages, or prevent base/updates from replacing a 3rd party package."
+ echo
+ echo "Option 2: Skip FFmpeg installation."
+ echo
+
+ if yes_or_no_ffmpeg; then
+ if [[ -z $RPMFUSION_IS_INSTALLED ]]; then
+ yum install -y epel-release
+ rpm -Uvh https://download1.rpmfusion.org/free/el/rpmfusion-free-release-7.noarch.rpm
+ fi
+ yum install -y ffmpeg
+ else
+ echo "FFmpeg installation skipped. You may build FFmpeg from sources as described here: https://trac.ffmpeg.org/wiki/CompilationGuide/Centos"
+ echo
+ fi
+ exit
+fi
--- /dev/null
+@echo off
+
+:: Copyright (c) 2018-2019 Intel Corporation
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+
+set ROOT=%~dp0
+call :GetFullPath "%ROOT%\.." ROOT
+set SCRIPT_NAME=%~nx0
+
+set "INTEL_OPENVINO_DIR=%ROOT%"
+set "INTEL_CVSDK_DIR=%INTEL_OPENVINO_DIR%"
+
+where /q libmmd.dll || echo Warning: libmmd.dll couldn't be found in %%PATH%%. Please check if the redistributable package for Intel(R) C++ Compiler is installed and the library path is added to the PATH environment variable. System reboot can be required to update the system environment.
+
+:: OpenCV
+if exist "%INTEL_OPENVINO_DIR%\opencv\setupvars.bat" (
+call "%INTEL_OPENVINO_DIR%\opencv\setupvars.bat"
+) else (
+set "OpenCV_DIR=%INTEL_OPENVINO_DIR%\opencv\x64\vc14\lib"
+set "PATH=%INTEL_OPENVINO_DIR%\opencv\x64\vc14\bin;%PATH%"
+)
+
:: Model Optimizer
:: NOTE: the "if exist" paths must be quoted -- an unquoted
:: %INTEL_OPENVINO_DIR% containing spaces or ")" (e.g. under
:: "Program Files (x86)") would break the parenthesized block.
if exist "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer" (
set "PYTHONPATH=%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer;%PYTHONPATH%"
set "PATH=%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer;%PATH%"
)

:: Inference Engine
set "InferenceEngine_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share"
set "HDDL_INSTALL_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl"
set "PATH=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%HDDL_INSTALL_DIR%\bin;%PATH%"
if exist "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\arch_descriptions" (
set "ARCH_ROOT_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\arch_descriptions"
)

:: nGraph
if exist "%INTEL_OPENVINO_DIR%\deployment_tools\ngraph" (
set "PATH=%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib;%PATH%"
set "ngraph_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake"
)
+
:: Check if Python is installed
python --version 2>NUL
if errorlevel 1 (
   echo Error^: Python is not installed. Please install Python 3.5. or 3.6 ^(64-bit^) from https://www.python.org/downloads/
   exit /B 1
)

:: Check Python version (expected output: "Python <major>.<minor>.<patch>")
for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
   set version=%%F
)

for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
   set Major=%%b
   set Minor=%%c
)

:: Use unquoted operands so GEQ compares numerically; the quoted form
:: compares lexicographically and would reject e.g. minor version 10
:: ("10" GEQ "5" is false as a string comparison).
if %Major% geq 3 (
   if %Minor% geq 5 (
      set python_ver=okay
   )
)

if not "%python_ver%"=="okay" (
   echo Unsupported Python version. Please install Python 3.5 or 3.6 ^(64-bit^) from https://www.python.org/downloads/
   exit /B 1
)
+
:: Check Python bitness
:: NOTE: "2>NUL" must have no spaces. The original "2 > NUL" passed a
:: literal "2" as an extra argument to python and redirected stdout
:: (not stderr), so the errorlevel probe did not work as intended.
python -c "import sys; print(64 if sys.maxsize > 2**32 else 32)" 2>NUL
if errorlevel 1 (
   echo Error^: Error during installed Python bitness detection
   exit /B 1
)

for /F "tokens=* USEBACKQ" %%F IN (`python -c "import sys; print(64 if sys.maxsize > 2**32 else 32)" 2^>^&1`) DO (
   set bitness=%%F
)

if not "%bitness%"=="64" (
   echo Unsupported Python bitness. Please install Python 3.5 or 3.6 ^(64-bit^) from https://www.python.org/downloads/
   exit /B 1
)
+
:: Expose the version-specific and generic Python API directories.
set "PYTHONPATH=%INTEL_OPENVINO_DIR%\python\python%Major%.%Minor%;%INTEL_OPENVINO_DIR%\python\python3;%PYTHONPATH%"

:: Quoted "if exist" path: an unquoted install dir with spaces/")" would
:: break the parenthesized block (e.g. "Program Files (x86)").
if exist "%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\accuracy_checker" (
   set "PYTHONPATH=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\accuracy_checker;%PYTHONPATH%"
)

echo [setupvars.bat] OpenVINO environment initialized

exit /B 0
+
:: GetFullPath <path> <out-var>
:: Normalizes <path> (resolving ".." components) and stores the absolute
:: result in the environment variable named by <out-var>.
:GetFullPath
SET %2=%~f1

GOTO :EOF
--- /dev/null
+#!/bin/bash
+
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# <INSTALLDIR> is substituted by the installer; when the script has not
# been processed (or the directory vanished), fall back to autodetection.
INSTALLDIR="${INTEL_OPENVINO_DIR:-<INSTALLDIR>}"
if [[ ! -d "${INSTALLDIR}" ]]; then
    # Script has not been processed by installer, so INSTALLDIR is not valid
    # Using autodetection assuming:
    # - current shell is "bash"
    # - location of the current script is in "INSTALLDIR/bin"
    SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
    BASE_DIR="$( dirname "$SCRIPT_DIR" )"

    INSTALLDIR="${BASE_DIR}"
fi

# Publish the root under both the current and the legacy variable name.
export INTEL_OPENVINO_DIR="$INSTALLDIR"
export INTEL_CVSDK_DIR="$INTEL_OPENVINO_DIR"
+
# parse command line options
# Supported: -pyver <version>  -- override the detected Python version
# used later when composing PYTHONPATH. Unknown options are ignored.
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
    -pyver)
    python_version=$2
    echo python_version = "${python_version}"
    shift
    ;;
    *)
    # unknown option -- silently ignored
    ;;
esac
shift
done
+
# Inference Engine: export CMake config location and prepend the runtime
# library directories to the loader path (DYLD_ on macOS, LD_ elsewhere).
if [ -e $INSTALLDIR/deployment_tools/inference_engine ]; then
    export InferenceEngine_DIR=$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/share
    # Pick the platform subdirectory (e.g. "intel64") by listing lib/.
    # NOTE(review): assumes lib/ holds exactly one entry -- with several,
    # $(ls) would yield a multi-word value; confirm for multi-arch installs.
    system_type=$(\ls $INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/)
    IE_PLUGINS_PATH=$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/$system_type

    if [[ -e ${IE_PLUGINS_PATH}/arch_descriptions ]]; then
        export ARCH_ROOT_DIR=${IE_PLUGINS_PATH}/arch_descriptions
    fi

    export HDDL_INSTALL_DIR=$INSTALLDIR/deployment_tools/inference_engine/external/hddl
    if [[ "$OSTYPE" == "darwin"* ]]; then
        export DYLD_LIBRARY_PATH=$INSTALLDIR/deployment_tools/inference_engine/external/mkltiny_mac/lib:$INSTALLDIR/deployment_tools/inference_engine/external/tbb/lib:$IE_PLUGINS_PATH:$DYLD_LIBRARY_PATH
        export LD_LIBRARY_PATH=$INSTALLDIR/deployment_tools/inference_engine/external/mkltiny_mac/lib:$INSTALLDIR/deployment_tools/inference_engine/external/tbb/lib:$IE_PLUGINS_PATH:$LD_LIBRARY_PATH
    else
        export LD_LIBRARY_PATH=$HDDL_INSTALL_DIR/lib:$INSTALLDIR/deployment_tools/inference_engine/external/gna/lib:$INSTALLDIR/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INSTALLDIR/deployment_tools/inference_engine/external/tbb/lib:$IE_PLUGINS_PATH:$LD_LIBRARY_PATH
    fi
fi
+
# nGraph: runtime libraries plus the CMake package location.
if [ -e $INSTALLDIR/deployment_tools/ngraph ]; then
    export LD_LIBRARY_PATH=$INSTALLDIR/deployment_tools/ngraph/lib:$LD_LIBRARY_PATH
    export ngraph_DIR=$INSTALLDIR/deployment_tools/ngraph/cmake
fi

# OpenCV: prefer the bundled setup script; otherwise wire up the
# prebuilt library directories directly.
if [ -e "$INSTALLDIR/opencv" ]; then
    if [ -f "$INSTALLDIR/opencv/setupvars.sh" ]; then
        source "$INSTALLDIR/opencv/setupvars.sh"
    else
        export OpenCV_DIR="$INSTALLDIR/opencv/share/OpenCV"
        export LD_LIBRARY_PATH="$INSTALLDIR/opencv/lib:$LD_LIBRARY_PATH"
        export LD_LIBRARY_PATH="$INSTALLDIR/opencv/share/OpenCV/3rdparty/lib:$LD_LIBRARY_PATH"
    fi
fi
+
+
# DL Streamer: delegate to its own environment script when present.
if [ -f "$INTEL_OPENVINO_DIR/data_processing/dl_streamer/bin/setupvars.sh" ]; then
    source "$INTEL_OPENVINO_DIR/data_processing/dl_streamer/bin/setupvars.sh"
fi

# Model Optimizer: callable from PATH and importable from PYTHONPATH.
export PATH="$INTEL_OPENVINO_DIR/deployment_tools/model_optimizer:$PATH"
export PYTHONPATH="$INTEL_OPENVINO_DIR/deployment_tools/model_optimizer:$PYTHONPATH"

# Accuracy Checker from the Open Model Zoo, when bundled.
if [ -e $INTEL_OPENVINO_DIR/deployment_tools/open_model_zoo/tools/accuracy_checker ]; then
    export PYTHONPATH="$INTEL_OPENVINO_DIR/deployment_tools/open_model_zoo/tools/accuracy_checker:$PYTHONPATH"
fi
+
# Autodetect the interpreter only when -pyver did not force one.
if [ -z "$python_version" ]; then
    # Probe the supported 3.x interpreters newest-first; the first hit
    # wins and its bitness is recorded for the warning below.
    for candidate in 3.7 3.6 3.5 3.4; do
        if command -v "python$candidate" >/dev/null 2>&1; then
            python_version=$candidate
            python_bitness=$("python$candidate" -c 'import sys; print(64 if sys.maxsize > 2**32 else 32)')
            break
        fi
    done
    # No supported 3.x found: fall back to 2.7, then to whatever
    # "python" resolves to (bitness is not checked on these paths).
    if [ -z "$python_version" ]; then
        if command -v python2.7 >/dev/null 2>&1; then
            python_version=2.7
        elif command -v python >/dev/null 2>&1; then
            python_version=$(python -c 'import sys; print(".".join(map(str, sys.version_info[:2])))')
        fi
    fi
fi
+
# Identify the distribution (empty when lsb_release is unavailable).
OS_NAME=""
if command -v lsb_release >/dev/null 2>&1; then
    OS_NAME=$(lsb_release -i -s)
fi

# Warn (not fail) on a 32-bit Python; Raspbian is exempt as a 32-bit OS.
# Fixed the user-facing typo "requred" -> "required".
if [ "$python_bitness" != "" ] && [ "$python_bitness" != "64" ] && [ "$OS_NAME" != "Raspbian" ]; then
    echo "[setupvars.sh] 64 bitness for Python" $python_version "is required"
fi
+
# Compose PYTHONPATH for the detected/forced interpreter version.
if [ ! -z "$python_version" ]; then
    if [ "$python_version" != "2.7" ]; then
        # add path to OpenCV API for Python 3.x
        export PYTHONPATH="$INTEL_OPENVINO_DIR/python/python3:$PYTHONPATH"
    fi
    # add path to Inference Engine Python API
    export PYTHONPATH="$INTEL_OPENVINO_DIR/python/python$python_version:$PYTHONPATH"
fi

echo "[setupvars.sh] OpenVINO environment initialized"
--- /dev/null
+import argparse
+import os
+from shutil import rmtree
+
+from utils import Automation
+
# Entry point: read the package BOM next to this script and stage the
# listed files into the "tools_package" directory.
parser = argparse.ArgumentParser()
parser.add_argument("--build_number", type=int, help="Build number to be added to package version", default=0, )
args = parser.parse_args()

auto = Automation()
base_dir = os.path.dirname(__file__)
bom_path = os.path.join(base_dir, "package_BOM.txt")
bom = auto.parse_bom(bom_path=bom_path)
# Reuse base_dir instead of recomputing os.path.dirname(__file__);
# the BOM paths are relative to the repository root (one level up).
dir_to_tar = auto.copy_files_from_bom(root_path=os.path.join(base_dir, ".."), bom=bom)
--- /dev/null
+import os
+import subprocess
+import tarfile
+from datetime import datetime
+from shutil import copyfile, copytree, rmtree
+
# Package version components; combined with the build number and the git
# short hash by Automation.add_version_txt.
major_version = 0
minor_version = 3
+
+
class Automation:
    """Helpers for assembling, versioning, and archiving the tools package."""

    @staticmethod
    def parse_bom(bom_path):
        """Return the BOM file's lines, newlines preserved, as a list.

        Uses a context manager so the file handle is closed deterministically;
        the original left the handle open for the garbage collector.
        """
        with open(bom_path) as bom_file:
            return list(bom_file)

    @staticmethod
    def copy_files_from_bom(root_path, bom):
        """Copy every BOM entry from *root_path* into a fresh staging dir.

        Any existing "tools_package" directory next to this script is wiped
        first. Returns the path of the populated directory.
        """
        target_dir = os.path.join(os.path.dirname(__file__), "tools_package")
        if os.path.exists(target_dir):
            rmtree(target_dir)
        os.makedirs(target_dir)
        for entry in bom:
            # BOM lines keep their trailing newline; strip it once here.
            relative = entry.strip('\n')
            src = os.path.join(root_path, relative)
            dst = os.path.join(target_dir, relative)
            if not os.path.exists(os.path.dirname(dst)):
                os.makedirs(os.path.dirname(dst))
            if os.path.isdir(src):
                copytree(src, dst)
            else:
                copyfile(src, dst)
        return target_dir

    @staticmethod
    def add_version_txt(dst_path, build_number, git_hash_short):
        """Write version.txt (timestamp, version string, full git hash).

        Returns "<major>.<minor>.<build_number>.<short_hash>". A
        *git_hash_short* of "0" is the sentinel for "not supplied" and
        triggers derivation via `git rev-parse --short HEAD`.
        """
        git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip("\n")
        if git_hash_short == "0":
            git_hash_short = subprocess.check_output(
                ["git", "rev-parse", "--short", "HEAD"]).decode("utf-8").strip("\n")
        version = "{0}.{1}.{2}.{3}".format(major_version, minor_version, build_number, git_hash_short)
        timestamp = datetime.now().strftime("%I:%M%p %B %d, %Y")
        with open(os.path.join(dst_path, "version.txt"), 'w') as f:
            f.write(timestamp + '\n')
            f.write(version + '\n')
            f.write(git_hash + '\n')
        return version

    @staticmethod
    def make_tarfile(out_file_name, source_dir):
        """Create a gzipped tar of *source_dir* next to this script.

        Fixes a path inconsistency: the original removed a stale archive at
        the script-relative path but then created the new one relative to
        the current working directory. Both operations now use the same
        script-relative archive_path.
        """
        archive_path = os.path.join(os.path.dirname(__file__), out_file_name)
        if os.path.exists(archive_path):
            os.remove(archive_path)
        with tarfile.open(archive_path, "w:gz") as tar:
            tar.add(source_dir, arcname=os.path.basename(source_dir))
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <irs_path>
+ <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/master_04d6f112132f92cab563ae7655747e0359687dc9/</value>
+ </irs_path>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <models>
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753847" vmpeak="1528832" vmrss="14005" vmhwm="814655" />
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="GPU" vmsize="580025" vmpeak="1743759" vmrss="234704" vmhwm="1462062" />
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1339971" vmpeak="1528828" vmrss="555262" vmhwm="814805" />
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1389159" vmpeak="1741154" vmrss="1036169" vmhwm="1460052" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753843" vmpeak="1545451" vmrss="14234" vmhwm="821334" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="602206" vmpeak="1511325" vmrss="257501" vmhwm="1230284" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1368206" vmpeak="1545456" vmrss="576774" vmhwm="821739" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1423096" vmpeak="1511373" vmrss="1074752" vmhwm="1230732" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772626" vmpeak="985754" vmrss="95260" vmhwm="151496" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1044604" vmpeak="1154709" vmrss="699168" vmhwm="811104" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985525" vmpeak="1057614" vmrss="159306" vmhwm="159306" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1163289" vmpeak="1235379" vmrss="812961" vmhwm="812961" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762770" vmpeak="1212248" vmrss="93570" vmhwm="426817" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1127847" vmpeak="1586310" vmrss="782029" vmhwm="1304679" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1351816" vmpeak="1423906" vmrss="353738" vmhwm="427644" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1660304" vmpeak="1660304" vmrss="1309215" vmhwm="1309215" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="791863" vmpeak="998329" vmrss="123059" vmhwm="240160" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1309598" vmpeak="1428944" vmrss="964066" vmhwm="1086751" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060303" vmpeak="1132392" vmrss="238924" vmhwm="240416" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1435214" vmpeak="1507303" vmrss="1084969" vmhwm="1084969" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864639" vmpeak="1153900" vmrss="147906" vmhwm="322590" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1541161" vmpeak="1686282" vmrss="1195972" vmhwm="1337595" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181479" vmpeak="1253568" vmrss="315581" vmhwm="322700" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1706760" vmpeak="1778849" vmrss="1356533" vmhwm="1356533" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="CPU" vmsize="754428" vmpeak="3004311" vmrss="17613" vmhwm="1856210" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="GPU" vmsize="710569" vmpeak="3363879" vmrss="365380" vmhwm="3081751" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="CPU" vmsize="2487130" vmpeak="3004311" vmrss="1687936" vmhwm="1856448" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="GPU" vmsize="2951748" vmpeak="3363804" vmrss="2597940" vmhwm="3080968" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767157" vmpeak="1369376" vmrss="63338" vmhwm="540166" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1155101" vmpeak="1701180" vmrss="809938" vmhwm="1420152" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1299262" vmpeak="1373882" vmrss="431758" vmhwm="540214" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1647738" vmpeak="1719828" vmrss="1296350" vmhwm="1419092" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1642832" vmrss="14014" vmhwm="789109" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="595430" vmpeak="1690484" vmrss="250496" vmhwm="1409205" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1494464" vmpeak="1642832" vmrss="679214" vmhwm="789412" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1450746" vmpeak="1693172" vmrss="1097681" vmhwm="1412254" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="919740" vmpeak="1521955" vmrss="234520" vmhwm="792022" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1666363" vmpeak="2175012" vmrss="1321245" vmhwm="1893936" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1436982" vmpeak="1521955" vmrss="643614" vmhwm="793218" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2138818" vmpeak="2210907" vmrss="1786162" vmhwm="1893760" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757262" vmpeak="978832" vmrss="81408" vmhwm="124238" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="810590" vmpeak="929139" vmrss="464868" vmhwm="503813" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="928637" vmpeak="1000727" vmrss="130719" vmhwm="130719" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="859478" vmpeak="931568" vmrss="507540" vmhwm="507540" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="766726" vmpeak="925245" vmrss="33382" vmhwm="180268" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="775117" vmpeak="913347" vmrss="430157" vmhwm="605598" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927163" vmpeak="999253" vmrss="141869" vmhwm="181156" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="924752" vmpeak="996842" vmrss="571590" vmhwm="602839" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="767003" vmpeak="1090526" vmrss="34900" vmhwm="348172" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="948046" vmpeak="1182082" vmrss="602624" vmhwm="900169" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1051481" vmpeak="1123570" vmrss="257219" vmhwm="348541" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1187106" vmpeak="1259196" vmrss="834438" vmhwm="902800" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764315" vmpeak="1326938" vmrss="63725" vmhwm="603213" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1183410" vmpeak="1680448" vmrss="837953" vmhwm="1398870" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1227798" vmpeak="1326908" vmrss="438160" vmhwm="602434" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1633997" vmpeak="1706086" vmrss="1281693" vmhwm="1395878" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="CPU" vmsize="753605" vmpeak="876330" vmrss="15571" vmhwm="29106" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="GPU" vmsize="566693" vmpeak="658486" vmrss="220783" vmhwm="232452" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="CPU" vmsize="808486" vmpeak="880576" vmrss="29084" vmhwm="29084" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="GPU" vmsize="586401" vmpeak="658490" vmrss="232764" vmhwm="232764" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754864" vmpeak="893692" vmrss="54617" vmhwm="81584" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="642527" vmpeak="750424" vmrss="296678" vmhwm="362300" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831336" vmpeak="903425" vmrss="85654" vmhwm="85654" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="716047" vmpeak="788136" vmrss="364434" vmhwm="364434" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756813" vmpeak="819698" vmrss="54410" vmhwm="78289" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758705" vmpeak="862466" vmrss="412966" vmhwm="437131" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="840967" vmpeak="840967" vmrss="82860" vmhwm="82860" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="787182" vmpeak="859271" vmrss="436801" vmhwm="436801" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="753715" vmpeak="876299" vmrss="17512" vmhwm="28402" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="583092" vmpeak="674744" vmrss="238220" vmhwm="249722" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="27865" vmhwm="27865" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600714" vmpeak="672804" vmrss="246967" vmhwm="246967" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="763677" vmpeak="874535" vmrss="13318" vmhwm="35327" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570521" vmpeak="662182" vmrss="224774" vmhwm="351410" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="973350" vmrss="108037" vmhwm="108037" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="685115" vmpeak="757204" vmrss="331421" vmhwm="351529" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14806" vmhwm="25911" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577280" vmpeak="667673" vmrss="232029" vmhwm="242580" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="25352" vmhwm="25352" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="593340" vmpeak="665429" vmrss="240200" vmhwm="240200" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="CPU" vmsize="764711" vmpeak="1279238" vmrss="23544" vmhwm="528431" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="GPU" vmsize="890428" vmpeak="1316884" vmrss="544882" vmhwm="1035192" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="CPU" vmsize="1187529" vmpeak="1279207" vmrss="398512" vmhwm="528730" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="GPU" vmsize="1288707" vmpeak="1360796" vmrss="935778" vmhwm="1038888" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="CPU" vmsize="755634" vmpeak="1259024" vmrss="23342" vmhwm="507980" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="GPU" vmsize="845886" vmpeak="1297898" vmrss="500957" vmhwm="1016822" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="CPU" vmsize="1327246" vmpeak="1327246" vmrss="384634" vmhwm="507522" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="GPU" vmsize="1277117" vmpeak="1300490" vmrss="923674" vmhwm="1018956" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="CPU" vmsize="757556" vmpeak="1471373" vmrss="32780" vmhwm="716861" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="GPU" vmsize="1153103" vmpeak="1684306" vmrss="807426" vmhwm="1402513" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="CPU" vmsize="1397686" vmpeak="1471373" vmrss="528620" vmhwm="717728" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="GPU" vmsize="1597785" vmpeak="1680465" vmrss="1244672" vmhwm="1399217" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1485853" vmrss="14330" vmhwm="773766" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="604573" vmpeak="1684861" vmrss="259556" vmhwm="1403600" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1311107" vmpeak="1485862" vmrss="528448" vmhwm="773656" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1346840" vmpeak="1684896" vmrss="993942" vmhwm="1403886" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757187" vmpeak="831362" vmrss="78795" vmhwm="113814" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="805270" vmpeak="920321" vmrss="460319" vmhwm="495638" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="852781" vmpeak="852781" vmrss="119033" vmhwm="119033" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="847052" vmpeak="919142" vmrss="494916" vmhwm="494916" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="CPU" vmsize="754248" vmpeak="925443" vmrss="16878" vmhwm="177663" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="GPU" vmsize="657659" vmpeak="799510" vmrss="312070" vmhwm="466153" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="CPU" vmsize="920163" vmpeak="920163" vmrss="131859" vmhwm="176726" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="GPU" vmsize="775350" vmpeak="847440" vmrss="422919" vmhwm="467610" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760584" vmpeak="1338202" vmrss="43243" vmhwm="616928" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1104862" vmpeak="1557006" vmrss="759030" vmhwm="1275071" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1224172" vmpeak="1338172" vmrss="434944" vmhwm="616849" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1452145" vmpeak="1558106" vmrss="1099428" vmhwm="1276787" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="764878" vmpeak="1551919" vmrss="58638" vmhwm="828383" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1315120" vmpeak="1977250" vmrss="968858" vmhwm="1694796" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1526166" vmpeak="1598256" vmrss="582401" vmhwm="829598" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1804748" vmpeak="1975855" vmrss="1451397" vmhwm="1693419" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="CPU" vmsize="927665" vmpeak="2236845" vmrss="224034" vmhwm="1396458" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="GPU" vmsize="1988676" vmpeak="3156291" vmrss="1643919" vmhwm="2874946" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="CPU" vmsize="2016999" vmpeak="2236955" vmrss="1117754" vmhwm="1396128" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="GPU" vmsize="2845849" vmpeak="3165219" vmrss="2493550" vmhwm="2883091" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766101" vmpeak="1079971" vmrss="27359" vmhwm="362142" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="834856" vmpeak="1080094" vmrss="490089" vmhwm="799312" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046381" vmpeak="1118471" vmrss="260528" vmhwm="362203" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1060109" vmpeak="1132199" vmrss="707876" vmhwm="804108" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="CPU" vmsize="758516" vmpeak="930397" vmrss="40572" vmhwm="194062" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="GPU" vmsize="873061" vmpeak="1013430" vmrss="528167" vmhwm="692564" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="CPU" vmsize="957620" vmpeak="1029710" vmrss="152754" vmhwm="194656" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="GPU" vmsize="1014305" vmpeak="1086395" vmrss="662525" vmhwm="694821" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="CPU" vmsize="759382" vmpeak="1174707" vmrss="39265" vmhwm="401856" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="GPU" vmsize="983083" vmpeak="1257471" vmrss="637335" vmhwm="975444" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="CPU" vmsize="1140730" vmpeak="1174672" vmrss="315977" vmhwm="401508" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="GPU" vmsize="1251214" vmpeak="1323304" vmrss="899034" vmhwm="976474" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="754890" vmpeak="815095" vmrss="28833" vmhwm="43881" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="651974" vmpeak="746719" vmrss="306455" vmhwm="321345" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="824942" vmpeak="897032" vmrss="48567" vmhwm="48567" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="676328" vmpeak="748418" vmrss="324860" vmhwm="324860" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="758212" vmpeak="813208" vmrss="29691" vmhwm="44220" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="611789" vmpeak="706534" vmrss="266244" vmhwm="324007" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818549" vmpeak="890639" vmrss="47141" vmhwm="47141" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="677705" vmpeak="749795" vmrss="326163" vmhwm="326163" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757534" vmpeak="911495" vmrss="36445" vmhwm="182050" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="835683" vmpeak="973280" vmrss="490613" vmhwm="658640" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="941076" vmpeak="1013166" vmrss="148222" vmhwm="183185" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="989608" vmpeak="1061698" vmrss="637709" vmhwm="661746" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="757174" vmpeak="901648" vmrss="73409" vmhwm="106537" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="801644" vmpeak="915186" vmrss="456517" vmhwm="490520" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="847932" vmpeak="847932" vmrss="116410" vmhwm="116410" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="843022" vmpeak="915112" vmrss="490864" vmhwm="490864" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="CPU" vmsize="765393" vmpeak="900402" vmrss="71544" vmhwm="105032" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="GPU" vmsize="759668" vmpeak="872762" vmrss="414493" vmhwm="497701" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="CPU" vmsize="848438" vmpeak="900754" vmrss="113590" vmhwm="113590" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="GPU" vmsize="847620" vmpeak="919710" vmrss="495730" vmhwm="495730" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755374" vmpeak="1146156" vmrss="22026" vmhwm="370176" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="768451" vmpeak="1074730" vmrss="423662" vmhwm="794266" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113609" vmpeak="1185698" vmrss="313513" vmhwm="370035" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1134227" vmpeak="1206317" vmrss="783006" vmhwm="795000" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="CPU" vmsize="755796" vmpeak="1267802" vmrss="23746" vmhwm="383983" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="GPU" vmsize="794565" vmpeak="1272634" vmrss="449394" vmhwm="991632" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="CPU" vmsize="1234050" vmpeak="1306140" vmrss="421194" vmhwm="421194" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="GPU" vmsize="1348960" vmpeak="1421050" vmrss="999050" vmhwm="999050" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754006" vmpeak="2548497" vmrss="15598" vmhwm="1808624" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668602" vmpeak="3326708" vmrss="323791" vmhwm="3045328" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027181" vmpeak="2548497" vmrss="1242560" vmhwm="1808730" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2441076" vmpeak="3326708" vmrss="2088055" vmhwm="3045050" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="2618030" vmrss="15510" vmhwm="1877383" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="739222" vmpeak="3397112" vmrss="393866" vmhwm="3115085" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2073794" vmpeak="2618030" vmrss="1289741" vmhwm="1878289" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518340" vmpeak="3397081" vmrss="2165196" vmhwm="3114975" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="CPU" vmsize="764940" vmpeak="947157" vmrss="27988" vmhwm="223726" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="GPU" vmsize="789223" vmpeak="941683" vmrss="443788" vmhwm="641476" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="CPU" vmsize="962187" vmpeak="1034277" vmrss="177848" vmhwm="224180" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="GPU" vmsize="969069" vmpeak="1041158" vmrss="616990" vmhwm="641977" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="CPU" vmsize="755651" vmpeak="1654985" vmrss="24921" vmhwm="920400" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="GPU" vmsize="936892" vmpeak="1838610" vmrss="590994" vmhwm="1556526" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="CPU" vmsize="1433352" vmpeak="1654989" vmrss="639456" vmhwm="918693" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="GPU" vmsize="1613176" vmpeak="1824922" vmrss="1259940" vmhwm="1543031" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754692" vmpeak="4259393" vmrss="18013" vmhwm="3532412" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="719105" vmpeak="5906194" vmrss="373648" vmhwm="5623600" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3167040" vmpeak="4259380" vmrss="2378362" vmhwm="3531237" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4165801" vmpeak="5903801" vmrss="3812393" vmhwm="5621585" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753860" vmpeak="1101161" vmrss="14599" vmhwm="375399" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="577640" vmpeak="1037480" vmrss="232443" vmhwm="755972" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1059828" vmpeak="1131917" vmrss="272879" vmhwm="374721" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="957453" vmpeak="1037445" vmrss="605026" vmhwm="756606" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="16790" vmhwm="680072" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="678964" vmpeak="1435790" vmrss="334017" vmhwm="1154573" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1279823" vmpeak="1422647" vmrss="490692" vmhwm="680526" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1325156" vmpeak="1438571" vmrss="972140" vmhwm="1157138" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753733" vmpeak="954430" vmrss="14278" vmhwm="229913" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="GPU" vmsize="568880" vmpeak="814976" vmrss="223907" vmhwm="533808" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1032882" vmpeak="1032882" vmrss="174631" vmhwm="230243" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="GPU" vmsize="810031" vmpeak="816178" vmrss="456856" vmhwm="534503" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756852" vmpeak="1587154" vmrss="31460" vmhwm="837570" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1159840" vmpeak="1822444" vmrss="813969" vmhwm="1540343" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1554462" vmpeak="1626552" vmrss="609677" vmhwm="836655" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1735610" vmpeak="1821749" vmrss="1383285" vmhwm="1540598" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753856" vmpeak="1528538" vmrss="14414" vmhwm="815491" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="580030" vmpeak="1741062" vmrss="235624" vmhwm="1460386" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1339681" vmpeak="1528538" vmrss="556146" vmhwm="815262" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1389097" vmpeak="1741093" vmrss="1036178" vmhwm="1460060" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772622" vmpeak="985749" vmrss="95431" vmhwm="151087" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1141962" vmpeak="1252068" vmrss="796734" vmhwm="827217" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985239" vmpeak="1057328" vmrss="158532" vmhwm="158532" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171425" vmpeak="1243514" vmrss="818624" vmhwm="818624" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="1211720" vmrss="93486" vmhwm="426896" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1312801" vmpeak="1592839" vmrss="967252" vmhwm="1311569" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1198124" vmpeak="1270214" vmrss="353051" vmhwm="427319" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1657339" vmpeak="1729428" vmrss="1304820" vmhwm="1304820" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="796360" vmpeak="1002408" vmrss="123094" vmhwm="239945" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1352916" vmpeak="1472262" vmrss="1007630" vmhwm="1084727" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1059880" vmpeak="1059880" vmrss="239307" vmhwm="241753" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1437656" vmpeak="1509745" vmrss="1084828" vmhwm="1084828" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864635" vmpeak="1154040" vmrss="148830" vmhwm="322528" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1505042" vmpeak="1650162" vmrss="1159906" vmhwm="1343711" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181056" vmpeak="1253146" vmrss="315048" vmhwm="322282" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1719256" vmpeak="1791345" vmrss="1366767" vmhwm="1366767" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767976" vmpeak="1370195" vmrss="63456" vmhwm="539897" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1313452" vmpeak="1701664" vmrss="968145" vmhwm="1420434" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1295571" vmpeak="1370195" vmrss="430610" vmhwm="539536" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1651421" vmpeak="1723510" vmrss="1299738" vmhwm="1422326" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="3124338" vmrss="17362" vmhwm="1770388" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="GPU" vmsize="669583" vmpeak="3628222" vmrss="324363" vmhwm="3347071" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2705824" vmpeak="3124338" vmrss="1906933" vmhwm="1906933" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="GPU" vmsize="3710449" vmpeak="3782539" vmrss="3356861" vmhwm="3356861" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="1192276" vmrss="32300" vmhwm="470417" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="GPU" vmsize="772970" vmpeak="1363872" vmrss="428054" vmhwm="1079412" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="CPU" vmsize="1123746" vmpeak="1195836" vmrss="335288" vmhwm="470162" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="GPU" vmsize="1219618" vmpeak="1362376" vmrss="875415" vmhwm="1077560" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848157" vmpeak="1522730" vmrss="178424" vmhwm="792470" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1549574" vmpeak="2182501" vmrss="1203804" vmhwm="1900742" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437730" vmpeak="1522730" vmrss="644402" vmhwm="794024" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2145426" vmpeak="2217516" vmrss="1793162" vmhwm="1899854" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="756584" vmpeak="925636" vmrss="32982" vmhwm="182529" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="769230" vmpeak="907847" vmrss="423874" vmhwm="604982" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="928659" vmpeak="928659" vmrss="142304" vmhwm="182353" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="926103" vmpeak="998192" vmrss="572985" vmhwm="603592" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="CPU" vmsize="757851" vmpeak="1078682" vmrss="34751" vmhwm="348154" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="GPU" vmsize="911473" vmpeak="1183102" vmrss="565549" vmhwm="900992" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="CPU" vmsize="1051652" vmpeak="1123742" vmrss="258231" vmhwm="349131" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="GPU" vmsize="1182570" vmpeak="1254660" vmrss="829659" vmhwm="899540" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764319" vmpeak="1327506" vmrss="61375" vmhwm="601048" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1206559" vmpeak="1676272" vmrss="860362" vmhwm="1393906" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1228396" vmpeak="1327475" vmrss="441135" vmhwm="603394" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1637486" vmpeak="1709576" vmrss="1285376" vmhwm="1398377" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="CPU" vmsize="761046" vmpeak="1754029" vmrss="43916" vmhwm="1002368" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="GPU" vmsize="1026110" vmpeak="2108686" vmrss="680191" vmhwm="1826792" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="CPU" vmsize="1512095" vmpeak="1753998" vmrss="701483" vmhwm="1002333" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="GPU" vmsize="1880973" vmpeak="2110306" vmrss="1532348" vmhwm="1828952" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="CPU" vmsize="759695" vmpeak="1636430" vmrss="38011" vmhwm="883225" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="GPU" vmsize="1118880" vmpeak="1994964" vmrss="773102" vmhwm="1713034" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="CPU" vmsize="1430871" vmpeak="1636434" vmrss="617078" vmhwm="882886" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="GPU" vmsize="1804484" vmpeak="1993530" vmrss="1450724" vmhwm="1711340" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="821893" vmrss="55070" vmhwm="82354" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="626304" vmpeak="734201" vmrss="280918" vmhwm="362925" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831344" vmpeak="903434" vmrss="86495" vmhwm="86495" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="718357" vmpeak="790446" vmrss="367096" vmhwm="367096" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756826" vmpeak="819711" vmrss="53961" vmhwm="77206" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758023" vmpeak="861784" vmrss="412702" vmhwm="436805" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="836470" vmpeak="891765" vmrss="83050" vmhwm="83050" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="788986" vmpeak="861075" vmrss="437646" vmhwm="437646" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="804491" vmrss="17490" vmhwm="28454" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="578894" vmpeak="670546" vmrss="233547" vmhwm="245172" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="28314" vmhwm="28314" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600507" vmpeak="672597" vmrss="247596" vmhwm="247596" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="753530" vmpeak="881588" vmrss="13208" vmhwm="35261" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570042" vmpeak="661702" vmrss="224870" vmhwm="353003" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="901260" vmrss="107390" vmhwm="107390" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="686408" vmpeak="758498" vmrss="332895" vmhwm="351907" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14546" vmhwm="25586" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577288" vmpeak="667682" vmrss="231642" vmhwm="242167" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="24468" vmhwm="24468" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="595588" vmpeak="667678" vmrss="242246" vmhwm="242246" />
+ <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="CPU" vmsize="753838" vmpeak="907420" vmrss="80674" vmhwm="122086" />
+ <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="GPU" vmsize="675633" vmpeak="798283" vmrss="330184" vmhwm="372754" />
+ <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="CPU" vmsize="841390" vmpeak="913479" vmrss="123776" vmhwm="123776" />
+ <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="GPU" vmsize="726066" vmpeak="798155" vmrss="390764" vmhwm="390764" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754080" vmpeak="884950" vmrss="35930" vmhwm="56368" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="GPU" vmsize="613082" vmpeak="713020" vmrss="267753" vmhwm="358019" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="CPU" vmsize="847726" vmpeak="919815" vmrss="83300" vmhwm="83300" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="GPU" vmsize="710754" vmpeak="782843" vmrss="357442" vmhwm="357442" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760821" vmpeak="1370292" vmrss="44242" vmhwm="618965" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1077643" vmpeak="1594964" vmrss="731733" vmhwm="1313127" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1256200" vmpeak="1370261" vmrss="444043" vmhwm="617852" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1494732" vmpeak="1596218" vmrss="1141690" vmhwm="1314187" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765322" vmpeak="1593790" vmrss="61120" vmhwm="831661" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1339184" vmpeak="2040148" vmrss="993968" vmhwm="1758746" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1414652" vmpeak="1593754" vmrss="594426" vmhwm="832220" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1871271" vmpeak="2037904" vmrss="1518501" vmhwm="1756343" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="760650" vmpeak="1369557" vmrss="43384" vmhwm="618015" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1022863" vmpeak="1592206" vmrss="676698" vmhwm="1309880" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1255557" vmpeak="1369522" vmrss="445350" vmhwm="618750" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1490077" vmpeak="1591563" vmrss="1137444" vmhwm="1309910" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="765204" vmpeak="1593108" vmrss="61124" vmhwm="831353" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1340754" vmpeak="2034586" vmrss="995636" vmhwm="1753100" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1413992" vmpeak="1593077" vmrss="592710" vmhwm="831098" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1867096" vmpeak="2036610" vmrss="1514532" vmhwm="1755089" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="CPU" vmsize="766911" vmpeak="1356080" vmrss="64389" vmhwm="623026" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="GPU" vmsize="1105068" vmpeak="1552320" vmrss="759990" vmhwm="1271340" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="CPU" vmsize="1258699" vmpeak="1356084" vmrss="468780" vmhwm="623788" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="GPU" vmsize="1478730" vmpeak="1553591" vmrss="1126364" vmhwm="1272167" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="CPU" vmsize="761239" vmpeak="1894468" vmrss="40691" vmhwm="1139410" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="GPU" vmsize="1418938" vmpeak="2248351" vmrss="1073886" vmhwm="1967262" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="CPU" vmsize="1618592" vmpeak="1894499" vmrss="810946" vmhwm="1140422" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="GPU" vmsize="1996112" vmpeak="2247322" vmrss="1660700" vmhwm="1965405" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="754987" vmpeak="880664" vmrss="29475" vmhwm="43832" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="616360" vmpeak="711106" vmrss="270859" vmhwm="322498" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818562" vmpeak="818562" vmrss="47141" vmhwm="47141" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="674124" vmpeak="746213" vmrss="322731" vmhwm="322731" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755224" vmpeak="1146433" vmrss="21806" vmhwm="370044" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="775324" vmpeak="1077709" vmrss="430342" vmhwm="796857" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113904" vmpeak="1185993" vmrss="312527" vmhwm="370946" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1137391" vmpeak="1137391" vmrss="785391" vmhwm="793201" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754133" vmpeak="2548906" vmrss="14955" vmhwm="1807044" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668619" vmpeak="3326725" vmrss="322691" vmhwm="3044404" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027476" vmpeak="2548906" vmrss="1242678" vmhwm="1808470" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438563" vmpeak="3326725" vmrss="2085028" vmhwm="3044505" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754226" vmpeak="2618325" vmrss="15708" vmhwm="1877977" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="741092" vmpeak="3397116" vmrss="396074" vmhwm="3115345" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2074089" vmpeak="2618325" vmrss="1290049" vmhwm="1878672" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518436" vmpeak="3397178" vmrss="2165728" vmhwm="3115459" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754701" vmpeak="4259684" vmrss="17626" vmhwm="3531853" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="747582" vmpeak="5921322" vmrss="402490" vmhwm="5639084" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3095241" vmpeak="4259670" vmrss="2379062" vmhwm="3530652" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4163667" vmpeak="5923566" vmrss="3810193" vmhwm="5640967" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="754023" vmpeak="1334414" vmrss="15254" vmhwm="608322" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="600701" vmpeak="1330978" vmrss="255912" vmhwm="1049844" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1215838" vmpeak="1334383" vmrss="428331" vmhwm="607442" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="1199972" vmpeak="1330384" vmrss="847391" vmhwm="1049228" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="CPU" vmsize="755387" vmpeak="1175570" vmrss="25374" vmhwm="306904" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="GPU" vmsize="805222" vmpeak="1346307" vmrss="460781" vmhwm="1065873" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="CPU" vmsize="1188580" vmpeak="1260670" vmrss="336036" vmhwm="336036" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="GPU" vmsize="1449408" vmpeak="1521498" vmrss="1096792" vmhwm="1096792" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="CPU" vmsize="756822" vmpeak="1181615" vmrss="28468" vmhwm="309716" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="GPU" vmsize="819271" vmpeak="2432738" vmrss="474764" vmhwm="1101047" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="CPU" vmsize="1189117" vmpeak="1261207" vmrss="333788" vmhwm="333788" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="GPU" vmsize="2539222" vmpeak="2611312" vmrss="2191604" vmhwm="2191604" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="757878" vmpeak="1077934" vmrss="35261" vmhwm="348964" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="899610" vmpeak="1179116" vmrss="553863" vmhwm="896997" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1050878" vmpeak="1077876" vmrss="256506" vmhwm="347974" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1179239" vmpeak="1251329" vmrss="826553" vmhwm="897714" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="760456" vmpeak="1096708" vmrss="27315" vmhwm="361944" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="834275" vmpeak="1073569" vmrss="489086" vmhwm="792343" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1058622" vmpeak="1130712" vmrss="267682" vmhwm="362749" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1050852" vmpeak="1122941" vmrss="697576" vmhwm="791040" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="CPU" vmsize="755950" vmpeak="1092203" vmrss="27640" vmhwm="362740" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="GPU" vmsize="835951" vmpeak="1073516" vmrss="490674" vmhwm="792224" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="CPU" vmsize="1058626" vmpeak="1130716" vmrss="266516" vmhwm="361992" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="GPU" vmsize="1050218" vmpeak="1071435" vmrss="696669" vmhwm="789848" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="880550" vmrss="29603" vmhwm="43212" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="648881" vmpeak="743626" vmrss="303424" vmhwm="318348" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="818246" vmpeak="818246" vmrss="46534" vmhwm="46534" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="674146" vmpeak="746235" vmrss="320315" vmhwm="320315" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="CPU" vmsize="764755" vmpeak="2092574" vmrss="38016" vmhwm="1352450" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="GPU" vmsize="1578328" vmpeak="3355976" vmrss="1233474" vmhwm="3074953" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="CPU" vmsize="1802838" vmpeak="2092587" vmrss="994188" vmhwm="1352709" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="GPU" vmsize="2958472" vmpeak="3352694" vmrss="2607677" vmhwm="3072185" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="CPU" vmsize="765124" vmpeak="2035453" vmrss="39745" vmhwm="1292420" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="GPU" vmsize="1939801" vmpeak="3261715" vmrss="1594617" vmhwm="2980577" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="CPU" vmsize="1750196" vmpeak="2039945" vmrss="935774" vmhwm="1291963" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="GPU" vmsize="2902235" vmpeak="3265460" vmrss="2551727" vmhwm="2984352" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="CPU" vmsize="757587" vmpeak="1547678" vmrss="33004" vmhwm="718973" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="GPU" vmsize="1154670" vmpeak="1678943" vmrss="809811" vmhwm="1398284" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="CPU" vmsize="1553134" vmpeak="1553134" vmrss="606232" vmhwm="719791" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="GPU" vmsize="1753910" vmpeak="1826000" vmrss="1400234" vmhwm="1400234" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="CPU" vmsize="757160" vmpeak="867486" vmrss="41307" vmhwm="62678" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="GPU" vmsize="743283" vmpeak="841055" vmrss="398604" vmhwm="537209" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="CPU" vmsize="888087" vmpeak="960176" vmrss="114166" vmhwm="114166" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="GPU" vmsize="894339" vmpeak="966429" vmrss="541912" vmhwm="541912" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772728" vmpeak="951218" vmrss="95840" vmhwm="151676" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1135195" vmpeak="1245301" vmrss="789848" vmhwm="820410" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985450" vmpeak="1057540" vmrss="159046" vmhwm="159046" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171152" vmpeak="1243242" vmrss="818598" vmhwm="818598" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="864168" vmpeak="998263" vmrss="126266" vmhwm="241604" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1353237" vmpeak="1472583" vmrss="1007978" vmhwm="1094614" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060316" vmpeak="1132406" vmrss="238326" vmhwm="240724" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1447146" vmpeak="1519236" vmrss="1094759" vmhwm="1097835" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="CPU" vmsize="757156" vmpeak="826843" vmrss="69031" vmhwm="100887" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="GPU" vmsize="796250" vmpeak="906813" vmrss="451171" vmhwm="482077" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="CPU" vmsize="849041" vmpeak="849041" vmrss="104464" vmhwm="104464" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="GPU" vmsize="833984" vmpeak="906074" vmrss="481786" vmhwm="481786" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="CPU" vmsize="760786" vmpeak="1139173" vmrss="66413" vmhwm="353346" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="GPU" vmsize="1055560" vmpeak="1255601" vmrss="710595" vmhwm="974815" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="CPU" vmsize="1097984" vmpeak="1170074" vmrss="281050" vmhwm="352228" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="GPU" vmsize="1259253" vmpeak="1331343" vmrss="906562" vmhwm="976483" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="920884" vmpeak="2443892" vmrss="237186" vmhwm="851215" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1751376" vmpeak="4164239" vmrss="1406411" vmhwm="3883422" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="757323" vmpeak="986519" vmrss="35006" vmhwm="212911" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="862219" vmpeak="1179283" vmrss="516881" vmhwm="897930" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="761538" vmpeak="1491811" vmrss="45667" vmhwm="671554" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1126884" vmpeak="1800550" vmrss="781739" vmhwm="1519302" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="CPU" vmsize="766964" vmpeak="1233342" vmrss="29568" vmhwm="415509" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="GPU" vmsize="897432" vmpeak="1347007" vmrss="553357" vmhwm="1067290" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="CPU" vmsize="756562" vmpeak="1099533" vmrss="30078" vmhwm="245590" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="GPU" vmsize="764170" vmpeak="1353149" vmrss="419267" vmhwm="1072244" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="CPU" vmsize="1478496" vmpeak="1478496" vmrss="332820" vmhwm="332820" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="GPU" vmsize="1423364" vmpeak="1495454" vmrss="1070973" vmhwm="1172441" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="755092" vmpeak="815298" vmrss="28811" vmhwm="43687" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="620734" vmpeak="715479" vmrss="274991" vmhwm="324935" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="825268" vmpeak="825268" vmrss="48439" vmhwm="48439" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="680592" vmpeak="752681" vmrss="326972" vmhwm="326972" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="765182" vmpeak="880712" vmrss="29827" vmhwm="44149" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="612620" vmpeak="707366" vmrss="266855" vmhwm="323734" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818879" vmpeak="818879" vmrss="46534" vmhwm="46534" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="681010" vmpeak="753099" vmrss="326902" vmhwm="326902" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848056" vmpeak="1522360" vmrss="147382" vmhwm="794481" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1699992" vmpeak="2187231" vmrss="1354892" vmhwm="1906344" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437365" vmpeak="1522364" vmrss="643724" vmhwm="793755" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2152515" vmpeak="2224604" vmrss="1800026" vmhwm="1900395" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757526" vmpeak="905132" vmrss="83195" vmhwm="119653" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="815988" vmpeak="932663" vmrss="470742" vmhwm="507760" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="1007820" vmpeak="1007820" vmrss="123926" vmhwm="123926" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="861520" vmpeak="933609" vmrss="507870" vmhwm="507870" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="756756" vmpeak="925425" vmrss="34007" vmhwm="180769" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="824168" vmpeak="962403" vmrss="478737" vmhwm="610280" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927669" vmpeak="999759" vmrss="141772" vmhwm="181966" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="936755" vmpeak="1008845" vmrss="583963" vmhwm="611516" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="759013" vmpeak="1063559" vmrss="51255" vmhwm="349113" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="925958" vmpeak="1184101" vmrss="580056" vmhwm="902325" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1043583" vmpeak="1115672" vmrss="263520" vmhwm="349034" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1189548" vmpeak="1261638" vmrss="836646" vmhwm="903676" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764574" vmpeak="1327493" vmrss="64108" vmhwm="603842" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1221717" vmpeak="1686643" vmrss="875617" vmhwm="1404475" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1381556" vmpeak="1403402" vmrss="440356" vmhwm="602751" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1641921" vmpeak="1714011" vmrss="1289340" vmhwm="1405430" />
+ <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="762119" vmpeak="2738828" vmrss="47203" vmhwm="947557" />
+ <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1295483" vmpeak="4189812" vmrss="949788" vmhwm="3908550" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="CPU" vmsize="763840" vmpeak="805556" vmrss="21938" vmhwm="33264" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="GPU" vmsize="652572" vmpeak="744180" vmrss="306754" vmhwm="318432" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="CPU" vmsize="814000" vmpeak="814000" vmrss="33391" vmhwm="33391" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="GPU" vmsize="672144" vmpeak="744233" vmrss="319026" vmhwm="319026" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="CPU" vmsize="754705" vmpeak="881188" vmrss="29282" vmhwm="44836" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="GPU" vmsize="614209" vmpeak="709759" vmrss="268778" vmhwm="326845" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="CPU" vmsize="818228" vmpeak="890318" vmrss="45513" vmhwm="45513" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="GPU" vmsize="682484" vmpeak="754573" vmrss="328966" vmhwm="328966" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="754903" vmpeak="821928" vmrss="55237" vmhwm="82768" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="643887" vmpeak="751788" vmrss="298685" vmhwm="367602" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="831111" vmpeak="831111" vmrss="86732" vmhwm="86732" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="720979" vmpeak="793069" vmrss="367584" vmhwm="367584" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="819759" vmrss="54586" vmhwm="78570" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="705724" vmpeak="809490" vmrss="360267" vmhwm="435512" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="835978" vmpeak="835978" vmrss="82583" vmhwm="82583" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="788902" vmpeak="860992" vmrss="435727" vmhwm="435727" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="CPU" vmsize="756725" vmpeak="831080" vmrss="76414" vmhwm="111914" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="GPU" vmsize="787058" vmpeak="902290" vmrss="441399" vmhwm="476911" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="CPU" vmsize="847299" vmpeak="847299" vmrss="120969" vmhwm="120969" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="GPU" vmsize="828920" vmpeak="901010" vmrss="475939" vmhwm="475939" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="CPU" vmsize="760988" vmpeak="1018754" vmrss="14484" vmhwm="296612" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="GPU" vmsize="600859" vmpeak="965967" vmrss="255569" vmhwm="685150" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="CPU" vmsize="1095155" vmpeak="1167245" vmrss="304607" vmhwm="304607" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="GPU" vmsize="1004577" vmpeak="1076666" vmrss="651943" vmhwm="689915" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="CPU" vmsize="756096" vmpeak="1100136" vmrss="27812" vmhwm="362344" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="GPU" vmsize="822830" vmpeak="1073947" vmrss="477193" vmhwm="792264" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="CPU" vmsize="1060571" vmpeak="1132661" vmrss="269808" vmhwm="362771" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="GPU" vmsize="1054684" vmpeak="1075272" vmrss="702310" vmhwm="794314" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760764" vmpeak="1338383" vmrss="42706" vmhwm="617047" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1108602" vmpeak="1561885" vmrss="762616" vmhwm="1279700" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1279819" vmpeak="1338409" vmrss="435102" vmhwm="617865" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1455146" vmpeak="1561388" vmrss="1101755" vmhwm="1279845" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765221" vmpeak="1552262" vmrss="59875" vmhwm="829250" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1322098" vmpeak="1985359" vmrss="976223" vmhwm="1703319" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1373006" vmpeak="1552293" vmrss="581891" vmhwm="829848" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1814348" vmpeak="1986380" vmrss="1461099" vmhwm="1704714" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766088" vmpeak="1079958" vmrss="27324" vmhwm="362155" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="838965" vmpeak="1085884" vmrss="493407" vmhwm="804324" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046157" vmpeak="1118246" vmrss="260515" vmhwm="362810" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1057223" vmpeak="1080772" vmrss="704066" vmhwm="799440" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="761754" vmpeak="1365104" vmrss="45179" vmhwm="620879" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1120737" vmpeak="1613546" vmrss="774637" vmhwm="1331308" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1251346" vmpeak="1365135" vmrss="446415" vmhwm="620241" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1515817" vmpeak="1613858" vmrss="1162572" vmhwm="1331968" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="839823" vmpeak="1569361" vmrss="155029" vmhwm="833157" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1363960" vmpeak="2068752" vmrss="1018507" vmhwm="1787042" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1476041" vmpeak="1569392" vmrss="679918" vmhwm="833914" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1904799" vmpeak="2060317" vmrss="1551756" vmhwm="1778167" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="CPU" vmsize="756602" vmpeak="1096774" vmrss="28393" vmhwm="363391" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="GPU" vmsize="845226" vmpeak="1103374" vmrss="500051" vmhwm="821986" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="CPU" vmsize="1063304" vmpeak="1135393" vmrss="271220" vmhwm="364399" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="GPU" vmsize="1092159" vmpeak="1105997" vmrss="738276" vmhwm="823983" />
+ <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="838816" vmpeak="1561762" vmrss="116930" vmhwm="752906" />
+ <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1674490" vmpeak="2318250" vmrss="1329842" vmhwm="2034986" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="755062" vmpeak="880739" vmrss="28415" vmhwm="43480" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="609298" vmpeak="704044" vmrss="263868" vmhwm="323488" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="825048" vmpeak="897138" vmrss="49108" vmhwm="49108" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="675844" vmpeak="747934" vmrss="322753" vmhwm="322753" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="CPU" vmsize="756804" vmpeak="978252" vmrss="70514" vmhwm="120370" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="GPU" vmsize="831318" vmpeak="949744" vmrss="485619" vmhwm="524550" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="CPU" vmsize="925689" vmpeak="997779" vmrss="130244" vmhwm="130244" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="GPU" vmsize="878099" vmpeak="950188" vmrss="525395" vmhwm="525395" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="CPU" vmsize="759435" vmpeak="1442861" vmrss="34680" vmhwm="509454" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="GPU" vmsize="1012906" vmpeak="1460487" vmrss="667977" vmhwm="1179833" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="CPU" vmsize="1368043" vmpeak="1442861" vmrss="427737" vmhwm="509533" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="GPU" vmsize="1542648" vmpeak="1542648" vmrss="1195304" vmhwm="1195304" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="CPU" vmsize="759558" vmpeak="1426185" vmrss="33862" vmhwm="507768" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="GPU" vmsize="1010358" vmpeak="1414454" vmrss="665451" vmhwm="1133941" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="CPU" vmsize="1350650" vmpeak="1426185" vmrss="421828" vmhwm="509168" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="GPU" vmsize="1493681" vmpeak="1565770" vmrss="1145416" vmhwm="1145416" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="761433" vmpeak="985784" vmrss="41514" vmhwm="254610" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="876933" vmpeak="1078919" vmrss="531814" vmhwm="798001" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="CPU" vmsize="1028508" vmpeak="1064698" vmrss="201212" vmhwm="254390" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="GPU" vmsize="1091807" vmpeak="1163896" vmrss="739525" vmhwm="798023" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="CPU" vmsize="754067" vmpeak="1169247" vmrss="15686" vmhwm="429523" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="GPU" vmsize="682413" vmpeak="1130109" vmrss="337194" vmhwm="848733" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="CPU" vmsize="1106463" vmpeak="1178553" vmrss="321428" vmhwm="429871" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="GPU" vmsize="1083904" vmpeak="1155994" vmrss="730976" vmhwm="845882" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754010" vmpeak="2548502" vmrss="15452" vmhwm="1807863" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="686602" vmpeak="3327385" vmrss="340982" vmhwm="3045398" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2026776" vmpeak="2548502" vmrss="1241011" vmhwm="1808730" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438568" vmpeak="3312188" vmrss="2084328" vmhwm="3029980" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754168" vmpeak="2617986" vmrss="16073" vmhwm="1877000" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="612194" vmpeak="3415310" vmrss="266732" vmhwm="3133363" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2145479" vmpeak="2617885" vmrss="1287272" vmhwm="1877568" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2521367" vmpeak="3415297" vmrss="2167426" vmhwm="3133059" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1426625" vmrss="17173" vmhwm="684173" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="684424" vmpeak="1460949" vmrss="339600" vmhwm="1180036" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1282802" vmpeak="1426625" vmrss="493737" vmhwm="684802" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1331783" vmpeak="1443006" vmrss="978560" vmhwm="1161124" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="CPU" vmsize="753724" vmpeak="954421" vmrss="14414" vmhwm="229578" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="GPU" vmsize="569179" vmpeak="816648" vmrss="224250" vmhwm="535449" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="CPU" vmsize="960810" vmpeak="960810" vmrss="174231" vmhwm="229807" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="GPU" vmsize="808627" vmpeak="880717" vmrss="455677" vmhwm="533002" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="17437" vmhwm="680666" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="GPU" vmsize="686316" vmpeak="1436296" vmrss="340586" vmhwm="1154617" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="CPU" vmsize="1279797" vmpeak="1422616" vmrss="490982" vmhwm="680147" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="GPU" vmsize="1330780" vmpeak="1442570" vmrss="978392" vmhwm="1161490" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756958" vmpeak="1587260" vmrss="31108" vmhwm="836506" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1163712" vmpeak="1824596" vmrss="819011" vmhwm="1543559" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1405879" vmpeak="1591766" vmrss="610302" vmhwm="836594" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1734233" vmpeak="1823470" vmrss="1381925" vmhwm="1542178" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753975" vmpeak="895633" vmrss="15637" vmhwm="140927" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="GPU" vmsize="599332" vmpeak="728939" vmrss="254029" vmhwm="412566" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="CPU" vmsize="903469" vmpeak="975559" vmrss="116124" vmhwm="141182" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="GPU" vmsize="741738" vmpeak="813828" vmrss="389259" vmhwm="413476" />
+ </models>
+</attributes>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ <value>caffe/FP32/caffenet/caffenet.xml</value>
+ <value>caffe/FP32/densenet_121/densenet_121.xml</value>
+ <value>caffe/FP32/densenet_161/densenet_161.xml</value>
+ <value>caffe/FP32/densenet_169/densenet_169.xml</value>
+ <value>caffe/FP32/densenet_201/densenet_201.xml</value>
+ <value>caffe/FP32/dpn_92/dpn_92.xml</value>
+ <value>caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+ <value>caffe/FP32/inception_v1/inception_v1.xml</value>
+ <value>caffe/FP32/inception_v2/inception_v2.xml</value>
+ <value>caffe/FP32/inception_v3/inception_v3.xml</value>
+ <value>caffe/FP32/inception_v4/inception_v4.xml</value>
+ <value>caffe/FP32/lenet/lenet.xml</value>
+ <value>caffe/FP32/mobilenet/mobilenet.xml</value>
+ <value>caffe/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+ <value>caffe/FP32/resnet_18/resnet_18.xml</value>
+ <value>caffe/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+ <value>caffe/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+ <value>caffe/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+ <value>caffe/FP32/resnet_v1_269/resnet_v1_269.xml</value>
+ <value>caffe/FP32/se_resnext_50/se_resnext_50.xml</value>
+ <value>caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml</value>
+ <value>caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+ <value>caffe/FP32/ssd_googlenet/ssd_googlenet.xml</value>
+ <value>caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml</value>
+ <value>caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml</value>
+ <value>caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+ <value>caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml</value>
+ <value>caffe/FP32/vgg16/vgg16.xml</value>
+ <value>caffe/FP32/vgg19/vgg19.xml</value>
+ <value>caffe/FP32/wrn_50_2/wrn_50_2.xml</value>
+ <value>caffe/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+ <value>caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+ <value>caffe/FP32/yolo_v2/yolo_v2.xml</value>
+ <value>caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml</value>
+ <value>caffe/FP32/yolo_v3/yolo_v3.xml</value>
+ <value>caffe/FP32/dilation/dilation.xml</value>
+ <value>caffe/FP32/dssd/dssd.xml</value>
+ <value>caffe/FP32/fcn8/fcn8.xml</value>
+ <value>caffe/FP32/fcn32/fcn32.xml</value>
+ <value>caffe/FP32/fcn_alexnet/fcn_alexnet.xml</value>
+ <value>caffe/FP32/mtcnn_p/mtcnn_p.xml</value>
+ <value>caffe/FP32/mtcnn_r/mtcnn_r.xml</value>
+ <value>caffe/FP32/mtcnn_o/mtcnn_o.xml</value>
+ <value>caffe/FP32/openpose_face/openpose_face.xml</value>
+ <value>caffe/FP32/openpose_hand/openpose_hand.xml</value>
+ <value>caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml</value>
+ <value>caffe/FP32/places205_alexnet/places205_alexnet.xml</value>
+ <value>caffe/FP32/places205_googlenet/places205_googlenet.xml</value>
+ <value>caffe/FP32/se_bn_inception/se_bn_inception.xml</value>
+ <value>caffe/FP32/vnect/vnect.xml</value>
+ <value>tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml</value>
+ <value>tf/1.14.0/FP32/bert_xnli/bert_xnli.xml</value>
+ <value>tf/1.14.0/FP32/cmu/cmu.xml</value>
+ <value>tf/1.14.0/FP32/densenet_121/densenet_121.xml</value>
+ <value>tf/1.14.0/FP32/densenet_169/densenet_169.xml</value>
+ <value>tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml</value>
+ <value>tf/1.14.0/FP32/east/east.xml</value>
+ <value>tf/1.14.0/FP32/facenet/facenet.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml</value>
+ <value>tf/1.14.0/FP32/gnmt/gnmt.xml</value>
+ <value>tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml</value>
+ <value>tf/1.14.0/FP32/inception_v1/inception_v1.xml</value>
+ <value>tf/1.14.0/FP32/inception_v2/inception_v2.xml</value>
+ <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+ <value>tf/1.14.0/FP32/inception_v4/inception_v4.xml</value>
+ <value>tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+ <value>tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml</value>
+ <value>tf/1.14.0/FP32/ncf/ncf.xml</value>
+ <value>tf/1.14.0/FP32/nasnet-a_large/nasnet-a_large.xml</value>
+ <value>tf/1.14.0/FP32/nasnet-a_mobile/nasnet-a_mobile.xml</value>
+ <value>tf/1.14.0/FP32/pnasnet-5_large/pnasnet-5_large.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+ <value>tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml</value>
+ <value>tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml</value>
+ <value>tf/1.14.0/FP32/unet2d/unet2d.xml</value>
+ <value>tf/1.14.0/FP32/vgg16/vgg16.xml</value>
+ <value>tf/1.14.0/FP32/vgg19/vgg19.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v2/yolo_v2.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v3/yolo_v3.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml</value>
+ <value>tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml</value>
+ <value>tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml</value>
+ <value>tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml</value>
+ <value>mxnet/FP32/caffenet/caffenet.xml</value>
+ <value>mxnet/FP32/densenet_121/densenet_121.xml</value>
+ <value>mxnet/FP32/densenet_161/densenet_161.xml</value>
+ <value>mxnet/FP32/densenet_169/densenet_169.xml</value>
+ <value>mxnet/FP32/densenet_201/densenet_201.xml</value>
+ <value>mxnet/FP32/inception_v3/inception_v3.xml</value>
+ <value>mxnet/FP32/inception_v4/inception_v4.xml</value>
+ <value>mxnet/FP32/mobilenet/mobilenet.xml</value>
+ <value>mxnet/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+ <value>mxnet/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+ <value>mxnet/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+ <value>mxnet/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+ <value>mxnet/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+ <value>mxnet/FP32/resnext_101/resnext_101.xml</value>
+ <value>mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+ <value>mxnet/FP32/ssd_inception_v3_512/ssd_inception_v3_512.xml</value>
+ <value>mxnet/FP32/ssd_mobilenet_512/ssd_mobilenet_512.xml</value>
+ <value>mxnet/FP32/ssd_resnet50_512/ssd_resnet50_512.xml</value>
+ <value>mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+ <value>mxnet/FP32/vgg16/vgg16.xml</value>
+ <value>mxnet/FP32/vgg19/vgg19.xml</value>
+ <value>mxnet/FP32/dpn_92/dpn_92.xml</value>
+ <value>mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml</value>
+ <value>mxnet/FP32/full_imagenet_network/full_imagenet_network.xml</value>
+ <value>mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+ <value>mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml</value>
+ <value>mxnet/FP32/location_net/location_net.xml</value>
+ <value>mxnet/FP32/lresnet100e/lresnet100e.xml</value>
+ <value>mxnet/FP32/mtcnn_p/mtcnn_p.xml</value>
+ <value>mxnet/FP32/mtcnn_r/mtcnn_r.xml</value>
+ <value>mxnet/FP32/mtcnn_o/mtcnn_o.xml</value>
+ <value>mxnet/FP32/nin/nin.xml</value>
+ <value>mxnet/FP32/nst_vgg19/nst_vgg19.xml</value>
+ <value>mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml</value>
+ <value>mxnet/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+ <value>mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+ <value>onnx/FP32/ssd_resnet34/ssd_resnet34.xml</value>
+ <value>onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml</value>
+ <value>onnx/FP32/retina_net/retina_net.xml</value>
+ <value>pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml</value>
+ <value>pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml</value>
+ <value>pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml</value>
+ <value>pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml</value>
+ </models>
+</attributes>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <models>
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753847" vmpeak="1528832" vmrss="14005" vmhwm="814655" />
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="GPU" vmsize="580025" vmpeak="1743759" vmrss="234704" vmhwm="1462062" />
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1339971" vmpeak="1528828" vmrss="555262" vmhwm="814805" />
+ <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1389159" vmpeak="1741154" vmrss="1036169" vmhwm="1460052" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753843" vmpeak="1545451" vmrss="14234" vmhwm="821334" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="602206" vmpeak="1511325" vmrss="257501" vmhwm="1230284" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1368206" vmpeak="1545456" vmrss="576774" vmhwm="821739" />
+ <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1423096" vmpeak="1511373" vmrss="1074752" vmhwm="1230732" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772626" vmpeak="985754" vmrss="95260" vmhwm="151496" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1044604" vmpeak="1154709" vmrss="699168" vmhwm="811104" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985525" vmpeak="1057614" vmrss="159306" vmhwm="159306" />
+ <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1163289" vmpeak="1235379" vmrss="812961" vmhwm="812961" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762770" vmpeak="1212248" vmrss="93570" vmhwm="426817" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1127847" vmpeak="1586310" vmrss="782029" vmhwm="1304679" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1351816" vmpeak="1423906" vmrss="353738" vmhwm="427644" />
+ <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1660304" vmpeak="1660304" vmrss="1309215" vmhwm="1309215" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="791863" vmpeak="998329" vmrss="123059" vmhwm="240160" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1309598" vmpeak="1428944" vmrss="964066" vmhwm="1086751" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060303" vmpeak="1132392" vmrss="238924" vmhwm="240416" />
+ <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1435214" vmpeak="1507303" vmrss="1084969" vmhwm="1084969" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864639" vmpeak="1153900" vmrss="147906" vmhwm="322590" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1541161" vmpeak="1686282" vmrss="1195972" vmhwm="1337595" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181479" vmpeak="1253568" vmrss="315581" vmhwm="322700" />
+ <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1706760" vmpeak="1778849" vmrss="1356533" vmhwm="1356533" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="CPU" vmsize="754428" vmpeak="3004311" vmrss="17613" vmhwm="1856210" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="GPU" vmsize="710569" vmpeak="3363879" vmrss="365380" vmhwm="3081751" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="CPU" vmsize="2487130" vmpeak="3004311" vmrss="1687936" vmhwm="1856448" />
+ <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="GPU" vmsize="2951748" vmpeak="3363804" vmrss="2597940" vmhwm="3080968" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767157" vmpeak="1369376" vmrss="63338" vmhwm="540166" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1155101" vmpeak="1701180" vmrss="809938" vmhwm="1420152" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1299262" vmpeak="1373882" vmrss="431758" vmhwm="540214" />
+ <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1647738" vmpeak="1719828" vmrss="1296350" vmhwm="1419092" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1642832" vmrss="14014" vmhwm="789109" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="595430" vmpeak="1690484" vmrss="250496" vmhwm="1409205" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1494464" vmpeak="1642832" vmrss="679214" vmhwm="789412" />
+ <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1450746" vmpeak="1693172" vmrss="1097681" vmhwm="1412254" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="919740" vmpeak="1521955" vmrss="234520" vmhwm="792022" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1666363" vmpeak="2175012" vmrss="1321245" vmhwm="1893936" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1436982" vmpeak="1521955" vmrss="643614" vmhwm="793218" />
+ <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2138818" vmpeak="2210907" vmrss="1786162" vmhwm="1893760" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757262" vmpeak="978832" vmrss="81408" vmhwm="124238" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="810590" vmpeak="929139" vmrss="464868" vmhwm="503813" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="928637" vmpeak="1000727" vmrss="130719" vmhwm="130719" />
+ <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="859478" vmpeak="931568" vmrss="507540" vmhwm="507540" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="766726" vmpeak="925245" vmrss="33382" vmhwm="180268" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="775117" vmpeak="913347" vmrss="430157" vmhwm="605598" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927163" vmpeak="999253" vmrss="141869" vmhwm="181156" />
+ <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="924752" vmpeak="996842" vmrss="571590" vmhwm="602839" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="767003" vmpeak="1090526" vmrss="34900" vmhwm="348172" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="948046" vmpeak="1182082" vmrss="602624" vmhwm="900169" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1051481" vmpeak="1123570" vmrss="257219" vmhwm="348541" />
+ <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1187106" vmpeak="1259196" vmrss="834438" vmhwm="902800" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764315" vmpeak="1326938" vmrss="63725" vmhwm="603213" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1183410" vmpeak="1680448" vmrss="837953" vmhwm="1398870" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1227798" vmpeak="1326908" vmrss="438160" vmhwm="602434" />
+ <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1633997" vmpeak="1706086" vmrss="1281693" vmhwm="1395878" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="CPU" vmsize="753605" vmpeak="876330" vmrss="15571" vmhwm="29106" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="GPU" vmsize="566693" vmpeak="658486" vmrss="220783" vmhwm="232452" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="CPU" vmsize="808486" vmpeak="880576" vmrss="29084" vmhwm="29084" />
+ <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="GPU" vmsize="586401" vmpeak="658490" vmrss="232764" vmhwm="232764" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754864" vmpeak="893692" vmrss="54617" vmhwm="81584" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="642527" vmpeak="750424" vmrss="296678" vmhwm="362300" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831336" vmpeak="903425" vmrss="85654" vmhwm="85654" />
+ <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="716047" vmpeak="788136" vmrss="364434" vmhwm="364434" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756813" vmpeak="819698" vmrss="54410" vmhwm="78289" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758705" vmpeak="862466" vmrss="412966" vmhwm="437131" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="840967" vmpeak="840967" vmrss="82860" vmhwm="82860" />
+ <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="787182" vmpeak="859271" vmrss="436801" vmhwm="436801" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="753715" vmpeak="876299" vmrss="17512" vmhwm="28402" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="583092" vmpeak="674744" vmrss="238220" vmhwm="249722" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="27865" vmhwm="27865" />
+ <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600714" vmpeak="672804" vmrss="246967" vmhwm="246967" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="763677" vmpeak="874535" vmrss="13318" vmhwm="35327" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570521" vmpeak="662182" vmrss="224774" vmhwm="351410" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="973350" vmrss="108037" vmhwm="108037" />
+ <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="685115" vmpeak="757204" vmrss="331421" vmhwm="351529" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14806" vmhwm="25911" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577280" vmpeak="667673" vmrss="232029" vmhwm="242580" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="25352" vmhwm="25352" />
+ <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="593340" vmpeak="665429" vmrss="240200" vmhwm="240200" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="CPU" vmsize="764711" vmpeak="1279238" vmrss="23544" vmhwm="528431" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="GPU" vmsize="890428" vmpeak="1316884" vmrss="544882" vmhwm="1035192" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="CPU" vmsize="1187529" vmpeak="1279207" vmrss="398512" vmhwm="528730" />
+ <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="GPU" vmsize="1288707" vmpeak="1360796" vmrss="935778" vmhwm="1038888" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="CPU" vmsize="755634" vmpeak="1259024" vmrss="23342" vmhwm="507980" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="GPU" vmsize="845886" vmpeak="1297898" vmrss="500957" vmhwm="1016822" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="CPU" vmsize="1327246" vmpeak="1327246" vmrss="384634" vmhwm="507522" />
+ <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="GPU" vmsize="1277117" vmpeak="1300490" vmrss="923674" vmhwm="1018956" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="CPU" vmsize="757556" vmpeak="1471373" vmrss="32780" vmhwm="716861" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="GPU" vmsize="1153103" vmpeak="1684306" vmrss="807426" vmhwm="1402513" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="CPU" vmsize="1397686" vmpeak="1471373" vmrss="528620" vmhwm="717728" />
+ <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="GPU" vmsize="1597785" vmpeak="1680465" vmrss="1244672" vmhwm="1399217" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1485853" vmrss="14330" vmhwm="773766" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="604573" vmpeak="1684861" vmrss="259556" vmhwm="1403600" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1311107" vmpeak="1485862" vmrss="528448" vmhwm="773656" />
+ <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1346840" vmpeak="1684896" vmrss="993942" vmhwm="1403886" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757187" vmpeak="831362" vmrss="78795" vmhwm="113814" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="805270" vmpeak="920321" vmrss="460319" vmhwm="495638" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="852781" vmpeak="852781" vmrss="119033" vmhwm="119033" />
+ <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="847052" vmpeak="919142" vmrss="494916" vmhwm="494916" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="CPU" vmsize="754248" vmpeak="925443" vmrss="16878" vmhwm="177663" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="GPU" vmsize="657659" vmpeak="799510" vmrss="312070" vmhwm="466153" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="CPU" vmsize="920163" vmpeak="920163" vmrss="131859" vmhwm="176726" />
+ <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="GPU" vmsize="775350" vmpeak="847440" vmrss="422919" vmhwm="467610" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760584" vmpeak="1338202" vmrss="43243" vmhwm="616928" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1104862" vmpeak="1557006" vmrss="759030" vmhwm="1275071" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1224172" vmpeak="1338172" vmrss="434944" vmhwm="616849" />
+ <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1452145" vmpeak="1558106" vmrss="1099428" vmhwm="1276787" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="764878" vmpeak="1551919" vmrss="58638" vmhwm="828383" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1315120" vmpeak="1977250" vmrss="968858" vmhwm="1694796" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1526166" vmpeak="1598256" vmrss="582401" vmhwm="829598" />
+ <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1804748" vmpeak="1975855" vmrss="1451397" vmhwm="1693419" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="CPU" vmsize="927665" vmpeak="2236845" vmrss="224034" vmhwm="1396458" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="GPU" vmsize="1988676" vmpeak="3156291" vmrss="1643919" vmhwm="2874946" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="CPU" vmsize="2016999" vmpeak="2236955" vmrss="1117754" vmhwm="1396128" />
+ <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="GPU" vmsize="2845849" vmpeak="3165219" vmrss="2493550" vmhwm="2883091" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766101" vmpeak="1079971" vmrss="27359" vmhwm="362142" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="834856" vmpeak="1080094" vmrss="490089" vmhwm="799312" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046381" vmpeak="1118471" vmrss="260528" vmhwm="362203" />
+ <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1060109" vmpeak="1132199" vmrss="707876" vmhwm="804108" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="CPU" vmsize="758516" vmpeak="930397" vmrss="40572" vmhwm="194062" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="GPU" vmsize="873061" vmpeak="1013430" vmrss="528167" vmhwm="692564" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="CPU" vmsize="957620" vmpeak="1029710" vmrss="152754" vmhwm="194656" />
+ <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="GPU" vmsize="1014305" vmpeak="1086395" vmrss="662525" vmhwm="694821" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="CPU" vmsize="759382" vmpeak="1174707" vmrss="39265" vmhwm="401856" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="GPU" vmsize="983083" vmpeak="1257471" vmrss="637335" vmhwm="975444" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="CPU" vmsize="1140730" vmpeak="1174672" vmrss="315977" vmhwm="401508" />
+ <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="GPU" vmsize="1251214" vmpeak="1323304" vmrss="899034" vmhwm="976474" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="754890" vmpeak="815095" vmrss="28833" vmhwm="43881" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="651974" vmpeak="746719" vmrss="306455" vmhwm="321345" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="824942" vmpeak="897032" vmrss="48567" vmhwm="48567" />
+ <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="676328" vmpeak="748418" vmrss="324860" vmhwm="324860" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="758212" vmpeak="813208" vmrss="29691" vmhwm="44220" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="611789" vmpeak="706534" vmrss="266244" vmhwm="324007" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818549" vmpeak="890639" vmrss="47141" vmhwm="47141" />
+ <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="677705" vmpeak="749795" vmrss="326163" vmhwm="326163" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757534" vmpeak="911495" vmrss="36445" vmhwm="182050" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="835683" vmpeak="973280" vmrss="490613" vmhwm="658640" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="941076" vmpeak="1013166" vmrss="148222" vmhwm="183185" />
+ <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="989608" vmpeak="1061698" vmrss="637709" vmhwm="661746" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="757174" vmpeak="901648" vmrss="73409" vmhwm="106537" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="801644" vmpeak="915186" vmrss="456517" vmhwm="490520" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="847932" vmpeak="847932" vmrss="116410" vmhwm="116410" />
+ <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="843022" vmpeak="915112" vmrss="490864" vmhwm="490864" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="CPU" vmsize="765393" vmpeak="900402" vmrss="71544" vmhwm="105032" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="GPU" vmsize="759668" vmpeak="872762" vmrss="414493" vmhwm="497701" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="CPU" vmsize="848438" vmpeak="900754" vmrss="113590" vmhwm="113590" />
+ <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="GPU" vmsize="847620" vmpeak="919710" vmrss="495730" vmhwm="495730" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755374" vmpeak="1146156" vmrss="22026" vmhwm="370176" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="768451" vmpeak="1074730" vmrss="423662" vmhwm="794266" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113609" vmpeak="1185698" vmrss="313513" vmhwm="370035" />
+ <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1134227" vmpeak="1206317" vmrss="783006" vmhwm="795000" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="CPU" vmsize="755796" vmpeak="1267802" vmrss="23746" vmhwm="383983" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="GPU" vmsize="794565" vmpeak="1272634" vmrss="449394" vmhwm="991632" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="CPU" vmsize="1234050" vmpeak="1306140" vmrss="421194" vmhwm="421194" />
+ <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="GPU" vmsize="1348960" vmpeak="1421050" vmrss="999050" vmhwm="999050" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754006" vmpeak="2548497" vmrss="15598" vmhwm="1808624" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668602" vmpeak="3326708" vmrss="323791" vmhwm="3045328" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027181" vmpeak="2548497" vmrss="1242560" vmhwm="1808730" />
+ <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2441076" vmpeak="3326708" vmrss="2088055" vmhwm="3045050" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="2618030" vmrss="15510" vmhwm="1877383" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="739222" vmpeak="3397112" vmrss="393866" vmhwm="3115085" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2073794" vmpeak="2618030" vmrss="1289741" vmhwm="1878289" />
+ <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518340" vmpeak="3397081" vmrss="2165196" vmhwm="3114975" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="CPU" vmsize="764940" vmpeak="947157" vmrss="27988" vmhwm="223726" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="GPU" vmsize="789223" vmpeak="941683" vmrss="443788" vmhwm="641476" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="CPU" vmsize="962187" vmpeak="1034277" vmrss="177848" vmhwm="224180" />
+ <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="GPU" vmsize="969069" vmpeak="1041158" vmrss="616990" vmhwm="641977" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="CPU" vmsize="755651" vmpeak="1654985" vmrss="24921" vmhwm="920400" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="GPU" vmsize="936892" vmpeak="1838610" vmrss="590994" vmhwm="1556526" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="CPU" vmsize="1433352" vmpeak="1654989" vmrss="639456" vmhwm="918693" />
+ <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="GPU" vmsize="1613176" vmpeak="1824922" vmrss="1259940" vmhwm="1543031" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754692" vmpeak="4259393" vmrss="18013" vmhwm="3532412" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="719105" vmpeak="5906194" vmrss="373648" vmhwm="5623600" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3167040" vmpeak="4259380" vmrss="2378362" vmhwm="3531237" />
+ <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4165801" vmpeak="5903801" vmrss="3812393" vmhwm="5621585" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753860" vmpeak="1101161" vmrss="14599" vmhwm="375399" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="577640" vmpeak="1037480" vmrss="232443" vmhwm="755972" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1059828" vmpeak="1131917" vmrss="272879" vmhwm="374721" />
+ <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="957453" vmpeak="1037445" vmrss="605026" vmhwm="756606" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="16790" vmhwm="680072" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="678964" vmpeak="1435790" vmrss="334017" vmhwm="1154573" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1279823" vmpeak="1422647" vmrss="490692" vmhwm="680526" />
+ <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1325156" vmpeak="1438571" vmrss="972140" vmhwm="1157138" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753733" vmpeak="954430" vmrss="14278" vmhwm="229913" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="GPU" vmsize="568880" vmpeak="814976" vmrss="223907" vmhwm="533808" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1032882" vmpeak="1032882" vmrss="174631" vmhwm="230243" />
+ <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="GPU" vmsize="810031" vmpeak="816178" vmrss="456856" vmhwm="534503" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756852" vmpeak="1587154" vmrss="31460" vmhwm="837570" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1159840" vmpeak="1822444" vmrss="813969" vmhwm="1540343" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1554462" vmpeak="1626552" vmrss="609677" vmhwm="836655" />
+ <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1735610" vmpeak="1821749" vmrss="1383285" vmhwm="1540598" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753856" vmpeak="1528538" vmrss="14414" vmhwm="815491" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="580030" vmpeak="1741062" vmrss="235624" vmhwm="1460386" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1339681" vmpeak="1528538" vmrss="556146" vmhwm="815262" />
+ <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1389097" vmpeak="1741093" vmrss="1036178" vmhwm="1460060" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772622" vmpeak="985749" vmrss="95431" vmhwm="151087" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1141962" vmpeak="1252068" vmrss="796734" vmhwm="827217" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985239" vmpeak="1057328" vmrss="158532" vmhwm="158532" />
+ <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171425" vmpeak="1243514" vmrss="818624" vmhwm="818624" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="1211720" vmrss="93486" vmhwm="426896" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1312801" vmpeak="1592839" vmrss="967252" vmhwm="1311569" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1198124" vmpeak="1270214" vmrss="353051" vmhwm="427319" />
+ <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1657339" vmpeak="1729428" vmrss="1304820" vmhwm="1304820" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="796360" vmpeak="1002408" vmrss="123094" vmhwm="239945" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1352916" vmpeak="1472262" vmrss="1007630" vmhwm="1084727" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1059880" vmpeak="1059880" vmrss="239307" vmhwm="241753" />
+ <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1437656" vmpeak="1509745" vmrss="1084828" vmhwm="1084828" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864635" vmpeak="1154040" vmrss="148830" vmhwm="322528" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1505042" vmpeak="1650162" vmrss="1159906" vmhwm="1343711" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181056" vmpeak="1253146" vmrss="315048" vmhwm="322282" />
+ <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1719256" vmpeak="1791345" vmrss="1366767" vmhwm="1366767" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767976" vmpeak="1370195" vmrss="63456" vmhwm="539897" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1313452" vmpeak="1701664" vmrss="968145" vmhwm="1420434" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1295571" vmpeak="1370195" vmrss="430610" vmhwm="539536" />
+ <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1651421" vmpeak="1723510" vmrss="1299738" vmhwm="1422326" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="3124338" vmrss="17362" vmhwm="1770388" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="GPU" vmsize="669583" vmpeak="3628222" vmrss="324363" vmhwm="3347071" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2705824" vmpeak="3124338" vmrss="1906933" vmhwm="1906933" />
+ <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="GPU" vmsize="3710449" vmpeak="3782539" vmrss="3356861" vmhwm="3356861" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="1192276" vmrss="32300" vmhwm="470417" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="GPU" vmsize="772970" vmpeak="1363872" vmrss="428054" vmhwm="1079412" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="CPU" vmsize="1123746" vmpeak="1195836" vmrss="335288" vmhwm="470162" />
+ <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="GPU" vmsize="1219618" vmpeak="1362376" vmrss="875415" vmhwm="1077560" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848157" vmpeak="1522730" vmrss="178424" vmhwm="792470" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1549574" vmpeak="2182501" vmrss="1203804" vmhwm="1900742" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437730" vmpeak="1522730" vmrss="644402" vmhwm="794024" />
+ <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2145426" vmpeak="2217516" vmrss="1793162" vmhwm="1899854" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="756584" vmpeak="925636" vmrss="32982" vmhwm="182529" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="769230" vmpeak="907847" vmrss="423874" vmhwm="604982" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="928659" vmpeak="928659" vmrss="142304" vmhwm="182353" />
+ <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="926103" vmpeak="998192" vmrss="572985" vmhwm="603592" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="CPU" vmsize="757851" vmpeak="1078682" vmrss="34751" vmhwm="348154" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="GPU" vmsize="911473" vmpeak="1183102" vmrss="565549" vmhwm="900992" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="CPU" vmsize="1051652" vmpeak="1123742" vmrss="258231" vmhwm="349131" />
+ <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="GPU" vmsize="1182570" vmpeak="1254660" vmrss="829659" vmhwm="899540" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764319" vmpeak="1327506" vmrss="61375" vmhwm="601048" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1206559" vmpeak="1676272" vmrss="860362" vmhwm="1393906" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1228396" vmpeak="1327475" vmrss="441135" vmhwm="603394" />
+ <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1637486" vmpeak="1709576" vmrss="1285376" vmhwm="1398377" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="CPU" vmsize="761046" vmpeak="1754029" vmrss="43916" vmhwm="1002368" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="GPU" vmsize="1026110" vmpeak="2108686" vmrss="680191" vmhwm="1826792" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="CPU" vmsize="1512095" vmpeak="1753998" vmrss="701483" vmhwm="1002333" />
+ <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="GPU" vmsize="1880973" vmpeak="2110306" vmrss="1532348" vmhwm="1828952" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="CPU" vmsize="759695" vmpeak="1636430" vmrss="38011" vmhwm="883225" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="GPU" vmsize="1118880" vmpeak="1994964" vmrss="773102" vmhwm="1713034" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="CPU" vmsize="1430871" vmpeak="1636434" vmrss="617078" vmhwm="882886" />
+ <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="GPU" vmsize="1804484" vmpeak="1993530" vmrss="1450724" vmhwm="1711340" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="821893" vmrss="55070" vmhwm="82354" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="626304" vmpeak="734201" vmrss="280918" vmhwm="362925" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831344" vmpeak="903434" vmrss="86495" vmhwm="86495" />
+ <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="718357" vmpeak="790446" vmrss="367096" vmhwm="367096" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756826" vmpeak="819711" vmrss="53961" vmhwm="77206" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758023" vmpeak="861784" vmrss="412702" vmhwm="436805" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="836470" vmpeak="891765" vmrss="83050" vmhwm="83050" />
+ <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="788986" vmpeak="861075" vmrss="437646" vmhwm="437646" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="804491" vmrss="17490" vmhwm="28454" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="578894" vmpeak="670546" vmrss="233547" vmhwm="245172" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="28314" vmhwm="28314" />
+ <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600507" vmpeak="672597" vmrss="247596" vmhwm="247596" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="753530" vmpeak="881588" vmrss="13208" vmhwm="35261" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570042" vmpeak="661702" vmrss="224870" vmhwm="353003" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="901260" vmrss="107390" vmhwm="107390" />
+ <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="686408" vmpeak="758498" vmrss="332895" vmhwm="351907" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14546" vmhwm="25586" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577288" vmpeak="667682" vmrss="231642" vmhwm="242167" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="24468" vmhwm="24468" />
+ <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="595588" vmpeak="667678" vmrss="242246" vmhwm="242246" />
+ <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="CPU" vmsize="753838" vmpeak="907420" vmrss="80674" vmhwm="122086" />
+ <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="GPU" vmsize="675633" vmpeak="798283" vmrss="330184" vmhwm="372754" />
+ <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="CPU" vmsize="841390" vmpeak="913479" vmrss="123776" vmhwm="123776" />
+ <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="GPU" vmsize="726066" vmpeak="798155" vmrss="390764" vmhwm="390764" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754080" vmpeak="884950" vmrss="35930" vmhwm="56368" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="GPU" vmsize="613082" vmpeak="713020" vmrss="267753" vmhwm="358019" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="CPU" vmsize="847726" vmpeak="919815" vmrss="83300" vmhwm="83300" />
+ <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="GPU" vmsize="710754" vmpeak="782843" vmrss="357442" vmhwm="357442" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760821" vmpeak="1370292" vmrss="44242" vmhwm="618965" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1077643" vmpeak="1594964" vmrss="731733" vmhwm="1313127" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1256200" vmpeak="1370261" vmrss="444043" vmhwm="617852" />
+ <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1494732" vmpeak="1596218" vmrss="1141690" vmhwm="1314187" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765322" vmpeak="1593790" vmrss="61120" vmhwm="831661" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1339184" vmpeak="2040148" vmrss="993968" vmhwm="1758746" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1414652" vmpeak="1593754" vmrss="594426" vmhwm="832220" />
+ <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1871271" vmpeak="2037904" vmrss="1518501" vmhwm="1756343" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="760650" vmpeak="1369557" vmrss="43384" vmhwm="618015" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1022863" vmpeak="1592206" vmrss="676698" vmhwm="1309880" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1255557" vmpeak="1369522" vmrss="445350" vmhwm="618750" />
+ <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1490077" vmpeak="1591563" vmrss="1137444" vmhwm="1309910" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="765204" vmpeak="1593108" vmrss="61124" vmhwm="831353" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1340754" vmpeak="2034586" vmrss="995636" vmhwm="1753100" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1413992" vmpeak="1593077" vmrss="592710" vmhwm="831098" />
+ <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1867096" vmpeak="2036610" vmrss="1514532" vmhwm="1755089" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="CPU" vmsize="766911" vmpeak="1356080" vmrss="64389" vmhwm="623026" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="GPU" vmsize="1105068" vmpeak="1552320" vmrss="759990" vmhwm="1271340" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="CPU" vmsize="1258699" vmpeak="1356084" vmrss="468780" vmhwm="623788" />
+ <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="GPU" vmsize="1478730" vmpeak="1553591" vmrss="1126364" vmhwm="1272167" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="CPU" vmsize="761239" vmpeak="1894468" vmrss="40691" vmhwm="1139410" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="GPU" vmsize="1418938" vmpeak="2248351" vmrss="1073886" vmhwm="1967262" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="CPU" vmsize="1618592" vmpeak="1894499" vmrss="810946" vmhwm="1140422" />
+ <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="GPU" vmsize="1996112" vmpeak="2247322" vmrss="1660700" vmhwm="1965405" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="754987" vmpeak="880664" vmrss="29475" vmhwm="43832" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="616360" vmpeak="711106" vmrss="270859" vmhwm="322498" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818562" vmpeak="818562" vmrss="47141" vmhwm="47141" />
+ <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="674124" vmpeak="746213" vmrss="322731" vmhwm="322731" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755224" vmpeak="1146433" vmrss="21806" vmhwm="370044" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="775324" vmpeak="1077709" vmrss="430342" vmhwm="796857" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113904" vmpeak="1185993" vmrss="312527" vmhwm="370946" />
+ <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1137391" vmpeak="1137391" vmrss="785391" vmhwm="793201" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754133" vmpeak="2548906" vmrss="14955" vmhwm="1807044" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668619" vmpeak="3326725" vmrss="322691" vmhwm="3044404" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027476" vmpeak="2548906" vmrss="1242678" vmhwm="1808470" />
+ <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438563" vmpeak="3326725" vmrss="2085028" vmhwm="3044505" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754226" vmpeak="2618325" vmrss="15708" vmhwm="1877977" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="741092" vmpeak="3397116" vmrss="396074" vmhwm="3115345" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2074089" vmpeak="2618325" vmrss="1290049" vmhwm="1878672" />
+ <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518436" vmpeak="3397178" vmrss="2165728" vmhwm="3115459" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754701" vmpeak="4259684" vmrss="17626" vmhwm="3531853" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="747582" vmpeak="5921322" vmrss="402490" vmhwm="5639084" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3095241" vmpeak="4259670" vmrss="2379062" vmhwm="3530652" />
+ <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4163667" vmpeak="5923566" vmrss="3810193" vmhwm="5640967" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="754023" vmpeak="1334414" vmrss="15254" vmhwm="608322" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="600701" vmpeak="1330978" vmrss="255912" vmhwm="1049844" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1215838" vmpeak="1334383" vmrss="428331" vmhwm="607442" />
+ <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="1199972" vmpeak="1330384" vmrss="847391" vmhwm="1049228" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="CPU" vmsize="755387" vmpeak="1175570" vmrss="25374" vmhwm="306904" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="GPU" vmsize="805222" vmpeak="1346307" vmrss="460781" vmhwm="1065873" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="CPU" vmsize="1188580" vmpeak="1260670" vmrss="336036" vmhwm="336036" />
+ <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="GPU" vmsize="1449408" vmpeak="1521498" vmrss="1096792" vmhwm="1096792" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="CPU" vmsize="756822" vmpeak="1181615" vmrss="28468" vmhwm="309716" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="GPU" vmsize="819271" vmpeak="2432738" vmrss="474764" vmhwm="1101047" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="CPU" vmsize="1189117" vmpeak="1261207" vmrss="333788" vmhwm="333788" />
+ <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="GPU" vmsize="2539222" vmpeak="2611312" vmrss="2191604" vmhwm="2191604" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="757878" vmpeak="1077934" vmrss="35261" vmhwm="348964" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="899610" vmpeak="1179116" vmrss="553863" vmhwm="896997" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1050878" vmpeak="1077876" vmrss="256506" vmhwm="347974" />
+ <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1179239" vmpeak="1251329" vmrss="826553" vmhwm="897714" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="760456" vmpeak="1096708" vmrss="27315" vmhwm="361944" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="834275" vmpeak="1073569" vmrss="489086" vmhwm="792343" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1058622" vmpeak="1130712" vmrss="267682" vmhwm="362749" />
+ <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1050852" vmpeak="1122941" vmrss="697576" vmhwm="791040" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="CPU" vmsize="755950" vmpeak="1092203" vmrss="27640" vmhwm="362740" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="GPU" vmsize="835951" vmpeak="1073516" vmrss="490674" vmhwm="792224" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="CPU" vmsize="1058626" vmpeak="1130716" vmrss="266516" vmhwm="361992" />
+ <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="GPU" vmsize="1050218" vmpeak="1071435" vmrss="696669" vmhwm="789848" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="880550" vmrss="29603" vmhwm="43212" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="648881" vmpeak="743626" vmrss="303424" vmhwm="318348" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="818246" vmpeak="818246" vmrss="46534" vmhwm="46534" />
+ <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="674146" vmpeak="746235" vmrss="320315" vmhwm="320315" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="CPU" vmsize="764755" vmpeak="2092574" vmrss="38016" vmhwm="1352450" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="GPU" vmsize="1578328" vmpeak="3355976" vmrss="1233474" vmhwm="3074953" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="CPU" vmsize="1802838" vmpeak="2092587" vmrss="994188" vmhwm="1352709" />
+ <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="GPU" vmsize="2958472" vmpeak="3352694" vmrss="2607677" vmhwm="3072185" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="CPU" vmsize="765124" vmpeak="2035453" vmrss="39745" vmhwm="1292420" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="GPU" vmsize="1939801" vmpeak="3261715" vmrss="1594617" vmhwm="2980577" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="CPU" vmsize="1750196" vmpeak="2039945" vmrss="935774" vmhwm="1291963" />
+ <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="GPU" vmsize="2902235" vmpeak="3265460" vmrss="2551727" vmhwm="2984352" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="CPU" vmsize="757587" vmpeak="1547678" vmrss="33004" vmhwm="718973" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="GPU" vmsize="1154670" vmpeak="1678943" vmrss="809811" vmhwm="1398284" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="CPU" vmsize="1553134" vmpeak="1553134" vmrss="606232" vmhwm="719791" />
+ <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="GPU" vmsize="1753910" vmpeak="1826000" vmrss="1400234" vmhwm="1400234" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="CPU" vmsize="757160" vmpeak="867486" vmrss="41307" vmhwm="62678" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="GPU" vmsize="743283" vmpeak="841055" vmrss="398604" vmhwm="537209" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="CPU" vmsize="888087" vmpeak="960176" vmrss="114166" vmhwm="114166" />
+ <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="GPU" vmsize="894339" vmpeak="966429" vmrss="541912" vmhwm="541912" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772728" vmpeak="951218" vmrss="95840" vmhwm="151676" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1135195" vmpeak="1245301" vmrss="789848" vmhwm="820410" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985450" vmpeak="1057540" vmrss="159046" vmhwm="159046" />
+ <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171152" vmpeak="1243242" vmrss="818598" vmhwm="818598" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="864168" vmpeak="998263" vmrss="126266" vmhwm="241604" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1353237" vmpeak="1472583" vmrss="1007978" vmhwm="1094614" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060316" vmpeak="1132406" vmrss="238326" vmhwm="240724" />
+ <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1447146" vmpeak="1519236" vmrss="1094759" vmhwm="1097835" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="CPU" vmsize="757156" vmpeak="826843" vmrss="69031" vmhwm="100887" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="GPU" vmsize="796250" vmpeak="906813" vmrss="451171" vmhwm="482077" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="CPU" vmsize="849041" vmpeak="849041" vmrss="104464" vmhwm="104464" />
+ <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="GPU" vmsize="833984" vmpeak="906074" vmrss="481786" vmhwm="481786" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="CPU" vmsize="760786" vmpeak="1139173" vmrss="66413" vmhwm="353346" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="GPU" vmsize="1055560" vmpeak="1255601" vmrss="710595" vmhwm="974815" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="CPU" vmsize="1097984" vmpeak="1170074" vmrss="281050" vmhwm="352228" />
+ <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="GPU" vmsize="1259253" vmpeak="1331343" vmrss="906562" vmhwm="976483" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="920884" vmpeak="2443892" vmrss="237186" vmhwm="851215" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1751376" vmpeak="4164239" vmrss="1406411" vmhwm="3883422" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="757323" vmpeak="986519" vmrss="35006" vmhwm="212911" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="862219" vmpeak="1179283" vmrss="516881" vmhwm="897930" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="761538" vmpeak="1491811" vmrss="45667" vmhwm="671554" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1126884" vmpeak="1800550" vmrss="781739" vmhwm="1519302" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="CPU" vmsize="766964" vmpeak="1233342" vmrss="29568" vmhwm="415509" />
+ <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="GPU" vmsize="897432" vmpeak="1347007" vmrss="553357" vmhwm="1067290" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="CPU" vmsize="756562" vmpeak="1099533" vmrss="30078" vmhwm="245590" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="GPU" vmsize="764170" vmpeak="1353149" vmrss="419267" vmhwm="1072244" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="CPU" vmsize="1478496" vmpeak="1478496" vmrss="332820" vmhwm="332820" />
+ <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="GPU" vmsize="1423364" vmpeak="1495454" vmrss="1070973" vmhwm="1172441" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="755092" vmpeak="815298" vmrss="28811" vmhwm="43687" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="620734" vmpeak="715479" vmrss="274991" vmhwm="324935" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="825268" vmpeak="825268" vmrss="48439" vmhwm="48439" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="680592" vmpeak="752681" vmrss="326972" vmhwm="326972" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="765182" vmpeak="880712" vmrss="29827" vmhwm="44149" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="612620" vmpeak="707366" vmrss="266855" vmhwm="323734" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818879" vmpeak="818879" vmrss="46534" vmhwm="46534" />
+ <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="681010" vmpeak="753099" vmrss="326902" vmhwm="326902" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848056" vmpeak="1522360" vmrss="147382" vmhwm="794481" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1699992" vmpeak="2187231" vmrss="1354892" vmhwm="1906344" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437365" vmpeak="1522364" vmrss="643724" vmhwm="793755" />
+ <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2152515" vmpeak="2224604" vmrss="1800026" vmhwm="1900395" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757526" vmpeak="905132" vmrss="83195" vmhwm="119653" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="815988" vmpeak="932663" vmrss="470742" vmhwm="507760" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="1007820" vmpeak="1007820" vmrss="123926" vmhwm="123926" />
+ <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="861520" vmpeak="933609" vmrss="507870" vmhwm="507870" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="756756" vmpeak="925425" vmrss="34007" vmhwm="180769" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="824168" vmpeak="962403" vmrss="478737" vmhwm="610280" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927669" vmpeak="999759" vmrss="141772" vmhwm="181966" />
+ <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="936755" vmpeak="1008845" vmrss="583963" vmhwm="611516" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="759013" vmpeak="1063559" vmrss="51255" vmhwm="349113" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="925958" vmpeak="1184101" vmrss="580056" vmhwm="902325" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1043583" vmpeak="1115672" vmrss="263520" vmhwm="349034" />
+ <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1189548" vmpeak="1261638" vmrss="836646" vmhwm="903676" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764574" vmpeak="1327493" vmrss="64108" vmhwm="603842" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1221717" vmpeak="1686643" vmrss="875617" vmhwm="1404475" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1381556" vmpeak="1403402" vmrss="440356" vmhwm="602751" />
+ <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1641921" vmpeak="1714011" vmrss="1289340" vmhwm="1405430" />
+ <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="762119" vmpeak="2738828" vmrss="47203" vmhwm="947557" />
+ <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1295483" vmpeak="4189812" vmrss="949788" vmhwm="3908550" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="CPU" vmsize="763840" vmpeak="805556" vmrss="21938" vmhwm="33264" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="GPU" vmsize="652572" vmpeak="744180" vmrss="306754" vmhwm="318432" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="CPU" vmsize="814000" vmpeak="814000" vmrss="33391" vmhwm="33391" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="GPU" vmsize="672144" vmpeak="744233" vmrss="319026" vmhwm="319026" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="CPU" vmsize="754705" vmpeak="881188" vmrss="29282" vmhwm="44836" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="GPU" vmsize="614209" vmpeak="709759" vmrss="268778" vmhwm="326845" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="CPU" vmsize="818228" vmpeak="890318" vmrss="45513" vmhwm="45513" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="GPU" vmsize="682484" vmpeak="754573" vmrss="328966" vmhwm="328966" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="754903" vmpeak="821928" vmrss="55237" vmhwm="82768" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="643887" vmpeak="751788" vmrss="298685" vmhwm="367602" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="831111" vmpeak="831111" vmrss="86732" vmhwm="86732" />
+ <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="720979" vmpeak="793069" vmrss="367584" vmhwm="367584" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="819759" vmrss="54586" vmhwm="78570" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="705724" vmpeak="809490" vmrss="360267" vmhwm="435512" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="835978" vmpeak="835978" vmrss="82583" vmhwm="82583" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="788902" vmpeak="860992" vmrss="435727" vmhwm="435727" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="CPU" vmsize="756725" vmpeak="831080" vmrss="76414" vmhwm="111914" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="GPU" vmsize="787058" vmpeak="902290" vmrss="441399" vmhwm="476911" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="CPU" vmsize="847299" vmpeak="847299" vmrss="120969" vmhwm="120969" />
+ <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="GPU" vmsize="828920" vmpeak="901010" vmrss="475939" vmhwm="475939" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="CPU" vmsize="760988" vmpeak="1018754" vmrss="14484" vmhwm="296612" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="GPU" vmsize="600859" vmpeak="965967" vmrss="255569" vmhwm="685150" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="CPU" vmsize="1095155" vmpeak="1167245" vmrss="304607" vmhwm="304607" />
+ <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="GPU" vmsize="1004577" vmpeak="1076666" vmrss="651943" vmhwm="689915" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="CPU" vmsize="756096" vmpeak="1100136" vmrss="27812" vmhwm="362344" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="GPU" vmsize="822830" vmpeak="1073947" vmrss="477193" vmhwm="792264" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="CPU" vmsize="1060571" vmpeak="1132661" vmrss="269808" vmhwm="362771" />
+ <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="GPU" vmsize="1054684" vmpeak="1075272" vmrss="702310" vmhwm="794314" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760764" vmpeak="1338383" vmrss="42706" vmhwm="617047" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1108602" vmpeak="1561885" vmrss="762616" vmhwm="1279700" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1279819" vmpeak="1338409" vmrss="435102" vmhwm="617865" />
+ <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1455146" vmpeak="1561388" vmrss="1101755" vmhwm="1279845" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765221" vmpeak="1552262" vmrss="59875" vmhwm="829250" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1322098" vmpeak="1985359" vmrss="976223" vmhwm="1703319" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1373006" vmpeak="1552293" vmrss="581891" vmhwm="829848" />
+ <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1814348" vmpeak="1986380" vmrss="1461099" vmhwm="1704714" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766088" vmpeak="1079958" vmrss="27324" vmhwm="362155" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="838965" vmpeak="1085884" vmrss="493407" vmhwm="804324" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046157" vmpeak="1118246" vmrss="260515" vmhwm="362810" />
+ <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1057223" vmpeak="1080772" vmrss="704066" vmhwm="799440" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="761754" vmpeak="1365104" vmrss="45179" vmhwm="620879" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1120737" vmpeak="1613546" vmrss="774637" vmhwm="1331308" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1251346" vmpeak="1365135" vmrss="446415" vmhwm="620241" />
+ <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1515817" vmpeak="1613858" vmrss="1162572" vmhwm="1331968" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="839823" vmpeak="1569361" vmrss="155029" vmhwm="833157" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1363960" vmpeak="2068752" vmrss="1018507" vmhwm="1787042" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1476041" vmpeak="1569392" vmrss="679918" vmhwm="833914" />
+ <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1904799" vmpeak="2060317" vmrss="1551756" vmhwm="1778167" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="CPU" vmsize="756602" vmpeak="1096774" vmrss="28393" vmhwm="363391" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="GPU" vmsize="845226" vmpeak="1103374" vmrss="500051" vmhwm="821986" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="CPU" vmsize="1063304" vmpeak="1135393" vmrss="271220" vmhwm="364399" />
+ <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="GPU" vmsize="1092159" vmpeak="1105997" vmrss="738276" vmhwm="823983" />
+ <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="838816" vmpeak="1561762" vmrss="116930" vmhwm="752906" />
+ <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1674490" vmpeak="2318250" vmrss="1329842" vmhwm="2034986" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="755062" vmpeak="880739" vmrss="28415" vmhwm="43480" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="609298" vmpeak="704044" vmrss="263868" vmhwm="323488" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="825048" vmpeak="897138" vmrss="49108" vmhwm="49108" />
+ <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="675844" vmpeak="747934" vmrss="322753" vmhwm="322753" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="CPU" vmsize="756804" vmpeak="978252" vmrss="70514" vmhwm="120370" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="GPU" vmsize="831318" vmpeak="949744" vmrss="485619" vmhwm="524550" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="CPU" vmsize="925689" vmpeak="997779" vmrss="130244" vmhwm="130244" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="GPU" vmsize="878099" vmpeak="950188" vmrss="525395" vmhwm="525395" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="CPU" vmsize="759435" vmpeak="1442861" vmrss="34680" vmhwm="509454" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="GPU" vmsize="1012906" vmpeak="1460487" vmrss="667977" vmhwm="1179833" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="CPU" vmsize="1368043" vmpeak="1442861" vmrss="427737" vmhwm="509533" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="GPU" vmsize="1542648" vmpeak="1542648" vmrss="1195304" vmhwm="1195304" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="CPU" vmsize="759558" vmpeak="1426185" vmrss="33862" vmhwm="507768" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="GPU" vmsize="1010358" vmpeak="1414454" vmrss="665451" vmhwm="1133941" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="CPU" vmsize="1350650" vmpeak="1426185" vmrss="421828" vmhwm="509168" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="GPU" vmsize="1493681" vmpeak="1565770" vmrss="1145416" vmhwm="1145416" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="761433" vmpeak="985784" vmrss="41514" vmhwm="254610" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="876933" vmpeak="1078919" vmrss="531814" vmhwm="798001" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="CPU" vmsize="1028508" vmpeak="1064698" vmrss="201212" vmhwm="254390" />
+ <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="GPU" vmsize="1091807" vmpeak="1163896" vmrss="739525" vmhwm="798023" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="CPU" vmsize="754067" vmpeak="1169247" vmrss="15686" vmhwm="429523" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="GPU" vmsize="682413" vmpeak="1130109" vmrss="337194" vmhwm="848733" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="CPU" vmsize="1106463" vmpeak="1178553" vmrss="321428" vmhwm="429871" />
+ <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="GPU" vmsize="1083904" vmpeak="1155994" vmrss="730976" vmhwm="845882" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754010" vmpeak="2548502" vmrss="15452" vmhwm="1807863" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="686602" vmpeak="3327385" vmrss="340982" vmhwm="3045398" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2026776" vmpeak="2548502" vmrss="1241011" vmhwm="1808730" />
+ <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438568" vmpeak="3312188" vmrss="2084328" vmhwm="3029980" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754168" vmpeak="2617986" vmrss="16073" vmhwm="1877000" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="612194" vmpeak="3415310" vmrss="266732" vmhwm="3133363" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2145479" vmpeak="2617885" vmrss="1287272" vmhwm="1877568" />
+ <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2521367" vmpeak="3415297" vmrss="2167426" vmhwm="3133059" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1426625" vmrss="17173" vmhwm="684173" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="684424" vmpeak="1460949" vmrss="339600" vmhwm="1180036" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1282802" vmpeak="1426625" vmrss="493737" vmhwm="684802" />
+ <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1331783" vmpeak="1443006" vmrss="978560" vmhwm="1161124" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="CPU" vmsize="753724" vmpeak="954421" vmrss="14414" vmhwm="229578" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="GPU" vmsize="569179" vmpeak="816648" vmrss="224250" vmhwm="535449" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="CPU" vmsize="960810" vmpeak="960810" vmrss="174231" vmhwm="229807" />
+ <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="GPU" vmsize="808627" vmpeak="880717" vmrss="455677" vmhwm="533002" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="17437" vmhwm="680666" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="GPU" vmsize="686316" vmpeak="1436296" vmrss="340586" vmhwm="1154617" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="CPU" vmsize="1279797" vmpeak="1422616" vmrss="490982" vmhwm="680147" />
+ <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="GPU" vmsize="1330780" vmpeak="1442570" vmrss="978392" vmhwm="1161490" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756958" vmpeak="1587260" vmrss="31108" vmhwm="836506" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1163712" vmpeak="1824596" vmrss="819011" vmhwm="1543559" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1405879" vmpeak="1591766" vmrss="610302" vmhwm="836594" />
+ <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1734233" vmpeak="1823470" vmrss="1381925" vmhwm="1542178" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753975" vmpeak="895633" vmrss="15637" vmhwm="140927" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="GPU" vmsize="599332" vmpeak="728939" vmrss="254029" vmhwm="412566" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="CPU" vmsize="903469" vmpeak="975559" vmrss="116124" vmhwm="141182" />
+ <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="GPU" vmsize="741738" vmpeak="813828" vmrss="389259" vmhwm="413476" />
+ </models>
+</attributes>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ <value>caffe/FP32/caffenet/caffenet.xml</value>
+ <value>caffe/FP32/densenet_121/densenet_121.xml</value>
+ <value>caffe/FP32/densenet_161/densenet_161.xml</value>
+ <value>caffe/FP32/densenet_169/densenet_169.xml</value>
+ <value>caffe/FP32/densenet_201/densenet_201.xml</value>
+ <value>caffe/FP32/dpn_92/dpn_92.xml</value>
+ <value>caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+ <value>caffe/FP32/inception_v1/inception_v1.xml</value>
+ <value>caffe/FP32/inception_v2/inception_v2.xml</value>
+ <value>caffe/FP32/inception_v3/inception_v3.xml</value>
+ <value>caffe/FP32/inception_v4/inception_v4.xml</value>
+ <value>caffe/FP32/lenet/lenet.xml</value>
+ <value>caffe/FP32/mobilenet/mobilenet.xml</value>
+ <value>caffe/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+ <value>caffe/FP32/resnet_18/resnet_18.xml</value>
+ <value>caffe/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+ <value>caffe/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+ <value>caffe/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+ <value>caffe/FP32/resnet_v1_269/resnet_v1_269.xml</value>
+ <value>caffe/FP32/se_resnext_50/se_resnext_50.xml</value>
+ <value>caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml</value>
+ <value>caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+ <value>caffe/FP32/ssd_googlenet/ssd_googlenet.xml</value>
+ <value>caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml</value>
+ <value>caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml</value>
+ <value>caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+ <value>caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml</value>
+ <value>caffe/FP32/vgg16/vgg16.xml</value>
+ <value>caffe/FP32/vgg19/vgg19.xml</value>
+ <value>caffe/FP32/wrn_50_2/wrn_50_2.xml</value>
+ <value>caffe/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+ <value>caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+ <value>caffe/FP32/yolo_v2/yolo_v2.xml</value>
+ <value>caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml</value>
+ <value>caffe/FP32/yolo_v3/yolo_v3.xml</value>
+ <value>caffe/FP32/dilation/dilation.xml</value>
+ <value>caffe/FP32/dssd/dssd.xml</value>
+ <value>caffe/FP32/fcn8/fcn8.xml</value>
+ <value>caffe/FP32/fcn32/fcn32.xml</value>
+ <value>caffe/FP32/fcn_alexnet/fcn_alexnet.xml</value>
+ <value>caffe/FP32/mtcnn_p/mtcnn_p.xml</value>
+ <value>caffe/FP32/mtcnn_r/mtcnn_r.xml</value>
+ <value>caffe/FP32/mtcnn_o/mtcnn_o.xml</value>
+ <value>caffe/FP32/openpose_face/openpose_face.xml</value>
+ <value>caffe/FP32/openpose_hand/openpose_hand.xml</value>
+ <value>caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml</value>
+ <value>caffe/FP32/places205_alexnet/places205_alexnet.xml</value>
+ <value>caffe/FP32/places205_googlenet/places205_googlenet.xml</value>
+ <value>caffe/FP32/se_bn_inception/se_bn_inception.xml</value>
+ <value>caffe/FP32/vnect/vnect.xml</value>
+ <value>tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml</value>
+ <value>tf/1.14.0/FP32/bert_xnli/bert_xnli.xml</value>
+ <value>tf/1.14.0/FP32/cmu/cmu.xml</value>
+ <value>tf/1.14.0/FP32/densenet_121/densenet_121.xml</value>
+ <value>tf/1.14.0/FP32/densenet_169/densenet_169.xml</value>
+ <value>tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml</value>
+ <value>tf/1.14.0/FP32/east/east.xml</value>
+ <value>tf/1.14.0/FP32/facenet/facenet.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml</value>
+ <value>tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml</value>
+ <value>tf/1.14.0/FP32/gnmt/gnmt.xml</value>
+ <value>tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml</value>
+ <value>tf/1.14.0/FP32/inception_v1/inception_v1.xml</value>
+ <value>tf/1.14.0/FP32/inception_v2/inception_v2.xml</value>
+ <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+ <value>tf/1.14.0/FP32/inception_v4/inception_v4.xml</value>
+ <value>tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+ <value>tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml</value>
+ <value>tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml</value>
+ <value>tf/1.14.0/FP32/ncf/ncf.xml</value>
+ <value>tf/1.14.0/FP32/nasnet-a_large/nasnet-a_large.xml</value>
+ <value>tf/1.14.0/FP32/nasnet-a_mobile/nasnet-a_mobile.xml</value>
+ <value>tf/1.14.0/FP32/pnasnet-5_large/pnasnet-5_large.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+ <value>tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+ <value>tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml</value>
+ <value>tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml</value>
+ <value>tf/1.14.0/FP32/unet2d/unet2d.xml</value>
+ <value>tf/1.14.0/FP32/vgg16/vgg16.xml</value>
+ <value>tf/1.14.0/FP32/vgg19/vgg19.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v2/yolo_v2.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v3/yolo_v3.xml</value>
+ <value>tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml</value>
+ <value>tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml</value>
+ <value>tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml</value>
+ <value>tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml</value>
+ <value>mxnet/FP32/caffenet/caffenet.xml</value>
+ <value>mxnet/FP32/densenet_121/densenet_121.xml</value>
+ <value>mxnet/FP32/densenet_161/densenet_161.xml</value>
+ <value>mxnet/FP32/densenet_169/densenet_169.xml</value>
+ <value>mxnet/FP32/densenet_201/densenet_201.xml</value>
+ <value>mxnet/FP32/inception_v3/inception_v3.xml</value>
+ <value>mxnet/FP32/inception_v4/inception_v4.xml</value>
+ <value>mxnet/FP32/mobilenet/mobilenet.xml</value>
+ <value>mxnet/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+ <value>mxnet/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+ <value>mxnet/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+ <value>mxnet/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+ <value>mxnet/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+ <value>mxnet/FP32/resnext_101/resnext_101.xml</value>
+ <value>mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+ <value>mxnet/FP32/ssd_inception_v3_512/ssd_inception_v3_512.xml</value>
+ <value>mxnet/FP32/ssd_mobilenet_512/ssd_mobilenet_512.xml</value>
+ <value>mxnet/FP32/ssd_resnet50_512/ssd_resnet50_512.xml</value>
+ <value>mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+ <value>mxnet/FP32/vgg16/vgg16.xml</value>
+ <value>mxnet/FP32/vgg19/vgg19.xml</value>
+ <value>mxnet/FP32/dpn_92/dpn_92.xml</value>
+ <value>mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml</value>
+ <value>mxnet/FP32/full_imagenet_network/full_imagenet_network.xml</value>
+ <value>mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+ <value>mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml</value>
+ <value>mxnet/FP32/location_net/location_net.xml</value>
+ <value>mxnet/FP32/lresnet100e/lresnet100e.xml</value>
+ <value>mxnet/FP32/mtcnn_p/mtcnn_p.xml</value>
+ <value>mxnet/FP32/mtcnn_r/mtcnn_r.xml</value>
+ <value>mxnet/FP32/mtcnn_o/mtcnn_o.xml</value>
+ <value>mxnet/FP32/nin/nin.xml</value>
+ <value>mxnet/FP32/nst_vgg19/nst_vgg19.xml</value>
+ <value>mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml</value>
+ <value>mxnet/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+ <value>mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+ <value>onnx/FP32/ssd_resnet34/ssd_resnet34.xml</value>
+ <value>onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml</value>
+ <value>onnx/FP32/retina_net/retina_net.xml</value>
+ <value>pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml</value>
+ <value>pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml</value>
+ <value>pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml</value>
+ <value>pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml</value>
+ </models>
+</attributes>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+<!--[ WARNING ] Use of attribute "processes" from config isn't implemented yet. It will be ignored.-->
+ <processes>
+ <value>1</value>
+ </processes>
+ <threads>
+ <value>1</value>
+ </threads>
+ <iterations>
+ <value>1000</value>
+ </iterations>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+ </models>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+<!--[ WARNING ] Use of attribute "processes" from config isn't implemented yet. It will be ignored.-->
+ <processes>
+ <value>1</value>
+ </processes>
+ <threads>
+ <value>1</value>
+ </threads>
+ <iterations>
+ <value>10000</value>
+ </iterations>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+ </models>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <processes>
+ <value>1</value>
+ </processes>
+ <threads>
+ <value>1</value>
+ <value>2</value>
+ </threads>
+ <iterations>
+ <value>100</value>
+ </iterations>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+ </models>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <processes>
+ <value>1</value>
+ <value>2</value>
+ </processes>
+ <threads>
+ <value>1</value>
+ <value>2</value>
+ </threads>
+ <iterations>
+ <value>1000</value>
+ </iterations>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+ <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+ </models>
+</attributes>
--- /dev/null
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+if(ENABLE_DOCKER)
+ cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
+else()
+ if (APPLE)
+ # due to https://cmake.org/cmake/help/v3.12/policy/CMP0068.html
+ cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
+ else()
+ cmake_minimum_required(VERSION 3.7.2 FATAL_ERROR)
+ endif()
+endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "")
+ message(STATUS "CMAKE_BUILD_TYPE not defined, 'Release' will be used")
+ set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+find_package(InferenceEngineDeveloperPackage REQUIRED)
+
+add_subdirectory(unittests)
+add_subdirectory(memleaks_tests)
+add_subdirectory(memcheck_tests)
--- /dev/null
+# Stress Tests Suite
+
+This test suite contains tests evaluating the behavior of various OpenVINO use
+cases under stress conditions:
+
+- MemCheckTests measure the memory required for the use cases and fail when
+memory usage exceeds a pre-defined level.
+
+- StressMemLeaksTests ensure that the use cases do not increase memory levels
+when executed continuously.
+
+- StressUnitTests execute various Inference Engine use cases in parallel
+threads and processes.
+
+Each test refers to configuration files located in `<test dir>\local_configs`
+folder. The configuration files are installed along with the tests at build time.
+
+## Getting Started
+
+Stress tests are based on the googletest framework. You can filter tests with
+`--gtest_filter` and explore tests available with `--gtest_list_tests` options.
+
+Tests measuring memory have a temporary limitation - they need to be executed
+one at a time to mitigate memory statistics pollution. You can use
+[gtest-parallel][gtest-parallel] for massive tests execution.
+
+### Pre-requisites
+
+- Linux OS to build the tests.
+
+- [gtest-parallel][gtest-parallel] to execute tests.
+
+### Building Tests
+
+Stress tests should be built in 2 steps.
+
+1. Build `dldt`
+
+Build `dldt` as usual but with `-DENABLE_TESTS=ON`.
+
+2. Build `stress_tests`
+
+Stress tests depend on the Inference Engine Developer Package located in the
+`dldt` build directory.
+
+In the command line snippet below, it is assumed that the Inference Engine
+Developer Package CMake module can be found in the directory `build` under
+`dldt` repository root.
+
+``` bash
+(
+export DLDT_BUILD_DIR=$(git rev-parse --show-toplevel)/build
+mkdir -p build && cd build && \
+cmake -DInferenceEngineDeveloperPackage_DIR=$DLDT_BUILD_DIR .. && make -j$(nproc) \
+)
+```
+
+### Preparing Test Data
+
+Stress tests use models from [Open Model Zoo][open_model_zoo]. Download and
+convert the models to IRs using the `./scripts/get_testdata.py` script.
+
+From Intel network you can use models from cache at `vdp_tests` file share.
+Refer to [VDP shared folders][VDP-shared-folders] on using file shares.
+
+### Running Tests
+
+``` bash
+gtest-parallel ./MemCheckTests
+```
+
+``` bash
+gtest-parallel ./StressMemLeaksTests
+```
+MemCheckTests logs can be used to gather reference values based on current
+memory consumption:
+
+``` bash
+mkdir -p MemCheckTests-logs && \
+gtest-parallel -d ./MemCheckTests-logs ./MemCheckTests && \
+grep -rh ./MemCheckTests-logs -e ".*<model " | sed -e "s/.*<model /<model /" | sort
+```
+
+[VDP-shared-folders]: https://wiki.ith.intel.com/display/DLSDK/VDP+shared+folders
+[gtest-parallel]: https://github.com/google/gtest-parallel
+[open_model_zoo]: https://github.com/opencv/open_model_zoo
\ No newline at end of file
--- /dev/null
+#include "pipelines.h"
+#include "../utils.h"
+
+#include <iostream>
+#include <string>
+
+#include <inference_engine.hpp>
+
+using namespace InferenceEngine;
+
+std::function<void()> load_unload_plugin(const std::string &target_device) {
+ return [&] {
+ Core ie;
+ // GetVersions silently register plugin in `plugins` through `GetCPPPluginByName`
+ ie.GetVersions(target_device);
+ // Remove plugin for target_device from `plugins`
+ ie.UnregisterPlugin(target_device);
+ };
+}
+
+std::function<void()> read_network(const std::string &model) {
+ return [&] {
+ IE_SUPPRESS_DEPRECATED_START
+ CNNNetReader netReader;
+ netReader.ReadNetwork(model);
+ netReader.ReadWeights(fileNameNoExt(model) + ".bin");
+ IE_SUPPRESS_DEPRECATED_END
+ };
+}
+
+std::function<void()> create_cnnnetwork(const std::string &model) {
+ return [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ };
+}
+
+std::function<void()> cnnnetwork_reshape_batch_x2(const std::string &model) {
+ return [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ const InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (const InputsDataMap::value_type& input : inputInfo) {
+ int batchIndex = -1;
+ auto layout = input.second->getTensorDesc().getLayout();
+ if ((layout == Layout::NCHW) || (layout == Layout::NCDHW) ||
+ (layout == Layout::NHWC) || (layout == Layout::NDHWC) ||
+ (layout == Layout::NC)) {
+ batchIndex = 0;
+ } else if (layout == CN) {
+ batchIndex = 1;
+ }
+ if (batchIndex != -1) {
+ shapes[input.first][batchIndex] *= 2;
+ doReshape = true;
+ }
+ }
+ if (doReshape)
+ cnnNetwork.reshape(shapes);
+ else
+ throw std::logic_error("Reshape wasn't applied for a model.");
+ };
+}
+
+std::function<void()> set_input_params(const std::string &model) {
+ return [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ for (auto &input : inputInfo) {
+ input.second->getPreProcess().setResizeAlgorithm(NO_RESIZE);
+ input.second->setPrecision(Precision::U8);
+ if (input.second->getInputData()->getTensorDesc().getDims().size() == 4)
+ input.second->setLayout(Layout::NCHW);
+ else if (input.second->getInputData()->getTensorDesc().getDims().size() == 2)
+ input.second->setLayout(Layout::NC);
+ else
+ throw std::logic_error("Setting of input parameters wasn't applied for a model.");
+ }
+ };
+}
+
+std::function<void()> create_exenetwork(const std::string &model, const std::string &target_device) {
+ return [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ };
+}
+
+std::function<void()> recreate_exenetwork(Core &ie, const std::string &model, const std::string &target_device) {
+ return [&] {
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ };
+}
+
+std::function<void()> create_infer_request(const std::string &model, const std::string &target_device) {
+ return [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ };
+}
+
+
+std::function<void()> recreate_infer_request(InferenceEngine::ExecutableNetwork& exeNetwork) {
+ return [&] {
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ };
+}
+
+std::function<void()> infer_request_inference(const std::string &model, const std::string &target_device) {
+ return [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+ };
+}
+
+std::function<void()> reinfer_request_inference(InferenceEngine::InferRequest& infer_request, InferenceEngine::CNNNetwork& cnnNetwork) {
+ return [&] {
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+ };
+}
--- /dev/null
+#include <string>
+#include <functional>
+#include <inference_engine.hpp>
+
+std::function<void()> load_unload_plugin(const std::string &target_device);
+std::function<void()> read_network(const std::string &model);
+std::function<void()> create_cnnnetwork(const std::string &model);
+std::function<void()> cnnnetwork_reshape_batch_x2(const std::string &model);
+std::function<void()> set_input_params(const std::string &model);
+std::function<void()> create_exenetwork(const std::string &model, const std::string &target_device);
+std::function<void()> recreate_exenetwork(InferenceEngine::Core &ie, const std::string &model, const std::string &target_device);
+std::function<void()> create_infer_request(const std::string &model, const std::string &target_device);
+std::function<void()> recreate_infer_request(InferenceEngine::ExecutableNetwork& exeNetwork);
+std::function<void()> infer_request_inference(const std::string &model, const std::string &target_device);
+std::function<void()> infer_request_inference(const std::string &model, const std::string &target_device);
+std::function<void()> reinfer_request_inference(InferenceEngine::InferRequest& infer_request, InferenceEngine::CNNNetwork& cnnNetwork);
--- /dev/null
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <functional>
+
+#include "../tests_utils.h"
+
+enum ManagerStatus {
+ NOT_STARTED = -2,
+ NOT_FINISHED = -1,
+ FINISHED_SUCCESSFULLY = 0,
+ FINISHED_UNEXPECTEDLY
+};
+
+template<typename Type>
+using Task = std::pair<ManagerStatus, std::function<Type()>>;
+
+template<typename Type>
+class TaskManager {
+public:
+ std::vector<Task<Type>> tasks;
+ std::vector<Type> tasks_results;
+
+ TaskManager() {}
+
+ TaskManager(const std::initializer_list<std::function<Type()>> &tasks_list) {
+ tasks.reserve(tasks_list.size());
+ for (const auto &task : tasks_list)
+ add_task(task);
+ }
+
+ void add_task(const std::function<Type()> &task) {
+ auto _task = Task<Type>(ManagerStatus::NOT_STARTED, task);
+ tasks.push_back(_task);
+ }
+
+ void run_sequentially() {
+ // TODO: make it asynchronous
+ tasks_results.reserve(tasks.size());
+ for (auto task : tasks) {
+ task.first = ManagerStatus::NOT_FINISHED;
+ tasks_results.push_back(task.second());
+ }
+ }
+
+ void run_parallel_n_wait() {
+ run_parallel();
+ wait_all();
+ }
+
+ void wait_all() {
+ int numtasks = tasks.size();
+ for (int i = 0; i < numtasks; i++)
+ if (tasks[i].first == ManagerStatus::NOT_FINISHED)
+ wait_task(i);
+ }
+
+ std::vector<ManagerStatus> get_all_statuses() {
+ std::vector<ManagerStatus> statuses;
+
+ int numtasks = tasks.size();
+ for (int i = 0; i < numtasks; i++)
+ statuses.push_back(get_task_status(i));
+ return statuses;
+ }
+
+ std::vector<TestResult> get_all_results() {
+ return tasks_results;
+ }
+
+ TestResult get_task_result(int task_index) {
+ if (tasks_results.empty() ||
+ tasks_results.size() < task_index ||
+ task_index < 0)
+ throw std::out_of_range("Task index " + std::to_string(task_index) + " out of number of tasks");
+
+ return tasks_results[task_index];
+ }
+
+ virtual void run_parallel() = 0;
+
+ virtual void wait_task(int task_index) = 0; // TODO: implement for run_sequentially
+
+ virtual ManagerStatus get_task_status(int task_index) = 0;
+
+};
\ No newline at end of file
--- /dev/null
+#include "task_manager.h"
+
+#include <future>
+
+template <typename Type>
+class ThreadManager : public TaskManager<Type> {
+public:
+ using TaskManager<Type>::tasks;
+ using TaskManager<Type>::tasks_results;
+ std::vector<std::future<TestResult>> threads;
+
+ using TaskManager<Type>::TaskManager;
+
+ void run_parallel() final {
+ // TODO: implement run_task function according to wait_task
+ int numtasks = tasks.size();
+ threads.reserve(numtasks);
+ tasks_results.reserve(numtasks);
+
+ for (int i = 0; i < numtasks; i++)
+ if (tasks[i].first == ManagerStatus::NOT_STARTED) {
+ tasks[i].first = ManagerStatus::NOT_FINISHED;
+ threads.push_back(std::async(std::launch::async, tasks[i].second));
+ }
+ }
+
+ void wait_task(int task_index) final {
+ if (threads.empty() ||
+ threads.size() < task_index ||
+ task_index < 0)
+ throw std::out_of_range("Task index " + std::to_string(task_index) + " out of number of tasks");
+
+ try {
+ tasks_results.push_back(threads[task_index].get());
+ tasks[task_index].first = ManagerStatus::FINISHED_SUCCESSFULLY;
+ } catch (std::exception &err) { // TODO: catch any exception
+ std::exception_ptr p = std::current_exception();
+ tasks[task_index].first = ManagerStatus::FINISHED_UNEXPECTEDLY;
+ tasks_results.push_back(TestResult(TestStatus::TEST_FAILED, "Test finished unexpectedly: " + (std::string)err.what()));
+ }
+ }
+
+ ManagerStatus get_task_status(int task_index) final {
+ if (threads.empty() ||
+ threads.size() < task_index ||
+ task_index < 0)
+ throw std::out_of_range("Task index " + std::to_string(task_index) + " out of number of tasks");
+
+ return tasks[task_index].first;
+ }
+};
\ No newline at end of file
--- /dev/null
+#include "tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+#include <string>
+
+#define DEBUG_MODE false
+
+const pugi::xml_document & Environment::getTestConfig() {
+ return _test_config;
+}
+
+void Environment::setTestConfig(const pugi::xml_document &test_config) {
+ _test_config.reset(test_config);
+}
+
+const pugi::xml_document & Environment::getEnvConfig() {
+ return _env_config;
+}
+
+void Environment::setEnvConfig(const pugi::xml_document &env_config) {
+ _env_config.reset(env_config);
+}
+
+std::vector<TestCase> generateTestsParams(std::initializer_list<std::string> fields) {
+ std::vector<TestCase> tests_cases;
+ const pugi::xml_document & test_config = Environment::Instance().getTestConfig();
+ std::string models_path = Environment::Instance().getEnvConfig()
+ .child("attributes").child("irs_path").child("value").text().as_string();
+
+ std::vector<int> processes;
+ std::vector<int> threads;
+ std::vector<int> iterations;
+ std::vector<std::string> devices;
+ std::vector<std::string> models;
+
+ pugi::xml_node values;
+ for (auto field = fields.begin(); field != fields.end(); field++) {
+ if (*field == "processes") {
+ values = test_config.child("attributes").child("processes");
+ for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+ processes.push_back(val.text().as_int());
+ } else if (*field == "threads") {
+ values = test_config.child("attributes").child("threads");
+ for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+ threads.push_back(val.text().as_int());
+ } else if (*field == "iterations") {
+ values = test_config.child("attributes").child("iterations");
+ for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+ iterations.push_back(val.text().as_int());
+ } else if (*field == "devices") {
+ values = test_config.child("attributes").child("devices");
+ for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+ devices.push_back(val.text().as_string());
+ } else if (*field == "models") {
+ values = test_config.child("attributes").child("models");
+ for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+ models.push_back(val.text().as_string());
+ }
+ }
+
+ // Initialize variables with default value if it weren't filled
+ processes = !processes.empty() ? processes: std::vector<int>{1};
+ threads = !threads.empty() ? threads: std::vector<int>{1};
+ iterations = !iterations.empty() ? iterations: std::vector<int>{1};
+ devices = !devices.empty() ? devices : std::vector<std::string>{"NULL"};
+ models = !models.empty() ? models : std::vector<std::string>{"NULL"};
+
+ for (auto &numprocesses : processes)
+ for (auto &numthreads : threads)
+ for (auto &numiters : iterations)
+ for (auto &device : devices)
+ for (auto &model : models)
+ tests_cases.push_back(TestCase(numprocesses, numthreads, numiters, device, OS_PATH_JOIN({models_path, model}), model));
+
+ return tests_cases;
+}
+
+std::string getTestCaseName(const testing::TestParamInfo<TestCase> &obj) {
+ return obj.param.test_case_name;
+}
+
+void test_wrapper(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase ¶ms) {
+ tests_pipeline(params.model, params.device, params.numiters);
+}
+
+void _runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase ¶ms) {
+ run_in_threads(params.numthreads, test_wrapper, tests_pipeline, params);
+}
+
+void runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase ¶ms) {
+#if DEBUG_MODE
+ tests_pipeline(params.model, params.device, params.numiters);
+#else
+ int status = run_in_processes(params.numprocesses, _runTest, tests_pipeline, params);
+ ASSERT_EQ(status, 0) << "Test failed with exitcode " << std::to_string(status);
+#endif
+}
+
--- /dev/null
+#pragma once
+
+#include "utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+#include <string>
+#include <vector>
+#include <thread>
+#include <unistd.h>
+#include <sys/wait.h>
+
+enum TestStatus
+{
+ TEST_NOT_STARTED = 0,
+ TEST_FAILED,
+ TEST_OK
+};
+
+using TestResult = std::pair<TestStatus, std::string>;
+
+// One combination of run parameters (process/thread/iteration counts, device
+// and model) plus a pre-computed, gtest-safe test-case name.
+class TestCase {
+public:
+    int numprocesses;
+    int numthreads;
+    int numiters;
+    std::string device;
+    std::string model_name;
+    std::string model;
+    std::string test_case_name;
+
+    TestCase(int _numprocesses, int _numthreads, int _numiters, std::string _device, const std::string& _model, const std::string& _model_name)
+            : numprocesses(_numprocesses),
+              numthreads(_numthreads),
+              numiters(_numiters),
+              device(_device),
+              model_name(_model_name),
+              model(_model) {
+        test_case_name = "Numprocesses_" + std::to_string(numprocesses) +
+                         "_Numthreads_" + std::to_string(numthreads) +
+                         "_Numiters_" + std::to_string(numiters) +
+                         "_Device_" + update_item_for_name(device) +
+                         "_Model_" + update_item_for_name(model_name);
+    }
+
+private:
+    // Replaces every character outside [A-Za-z0-9_] with '_' so the result is
+    // a valid gtest name fragment.
+    std::string update_item_for_name(const std::string &item) {
+        std::string sanitized(item);
+        for (auto &ch : sanitized) {
+            if (!isalnum(ch) && ch != '_')
+                ch = '_';
+        }
+        return sanitized;
+    }
+};
+
+// Process-wide singleton holding the parsed test/env XML configs; populated
+// once from main() before RUN_ALL_TESTS.  Accessor bodies live in a .cpp file.
+class Environment {
+private:
+    pugi::xml_document _test_config;
+    pugi::xml_document _env_config;
+    Environment() = default;
+    Environment(const Environment&) = delete;
+    Environment& operator=(const Environment&) = delete;
+public:
+    // Meyers singleton: constructed on first use, one instance per process.
+    static Environment& Instance(){
+        static Environment env;
+        return env;
+    }
+
+    const pugi::xml_document & getTestConfig();
+    void setTestConfig(const pugi::xml_document &test_config);
+    const pugi::xml_document & getEnvConfig();
+    void setEnvConfig(const pugi::xml_document &env_config);
+};
+
+std::vector<TestCase> generateTestsParams(std::initializer_list<std::string> items);
+std::string getTestCaseName(const testing::TestParamInfo<TestCase> &obj);
+
+void runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase ¶ms);
+void _runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase ¶ms);
+void test_wrapper(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase ¶ms);
--- /dev/null
+#include "utils.h"
+
+#include <string>
+#include <string.h>
+
+// Joins path components with the platform separator; an empty list yields "".
+std::string OS_PATH_JOIN(std::initializer_list<std::string> list) {
+    auto it = list.begin();
+    if (it == list.end())
+        return "";
+    std::string joined = *it;
+    while (++it != list.end())
+        joined += OS_SEP + *it;
+    return joined;
+}
+
+// Strips the extension (everything from the last '.') from a path; the input
+// is returned unchanged when it contains no dot.
+std::string fileNameNoExt(const std::string &filepath) {
+    const auto dot = filepath.rfind('.');
+    return (dot == std::string::npos) ? filepath : filepath.substr(0, dot);
+}
+
+
+// Extracts the integer value (in KB) from a "/proc/self/status" line such as
+// "VmSize:     123456 kB".  Returns 0 when the line contains no digit.
+// Unlike the previous version this never scans past the terminating '\0'
+// and never writes outside the buffer when the line is shorter than 3 chars.
+static size_t parseLine(char* line) {
+    for (const char* p = line; *p != '\0'; ++p) {
+        if (*p >= '0' && *p <= '9')
+            return (size_t) atoi(p);  // atoi stops at the first non-digit (" kB")
+    }
+    return 0;
+}
+
+#ifdef _WIN32
+size_t getVmSizeInKB() {
+ // TODO rewrite for Virtual Memory
+ PROCESS_MEMORY_COUNTERS pmc;
+ pmc.cb = sizeof(PROCESS_MEMORY_COUNTERS);
+ GetProcessMemoryInfo(GetCurrentProcess(),&pmc, pmc.cb);
+ return pmc.WorkingSetSize;
+ }
+#else
+// Reads one "Vm*" entry (in KB) from /proc/self/status.  `name` is the field
+// prefix including the colon, e.g. "VmSize:".  Returns 0 when the status file
+// cannot be opened or the field is absent.  Takes const char* so callers can
+// pass string literals without casts.
+size_t getVirtualMemoryInKB(const char *name) {
+    FILE* file = fopen("/proc/self/status", "r");
+    size_t result = 0;
+    if (file != nullptr) {
+        char line[128];
+        const size_t name_len = strlen(name);  // hoisted out of the scan loop
+
+        while (fgets(line, sizeof(line), file) != NULL) {
+            if (strncmp(line, name, name_len) == 0) {
+                result = parseLine(line);
+                break;
+            }
+        }
+        fclose(file);
+    }
+    return result;
+}
+
+size_t getVmSizeInKB() {return getVirtualMemoryInKB("VmSize:");}
+size_t getVmPeakInKB() {return getVirtualMemoryInKB("VmPeak:");}
+size_t getVmRSSInKB() {return getVirtualMemoryInKB("VmRSS:");}
+size_t getVmHWMInKB() {return getVirtualMemoryInKB("VmHWM:");}
+
+#endif
--- /dev/null
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <thread>
+#include <functional>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+
+#ifdef _WIN32
+#define OS_SEP std::string("\\")
+#else
+#define OS_SEP std::string("/")
+#endif
+
+
+#define log_info(str) std::cout << "[ INFO ] " << str << std::endl
+#define log_warn(str) std::cout << "[ WARNING ] " << str << std::endl
+#define log_err(str) std::cout << "[ ERROR ] " << str << std::endl
+#define log_debug(str) std::cout << "[ DEBUG ] " << str << std::endl
+
+std::string OS_PATH_JOIN(std::initializer_list<std::string> list);
+
+std::string fileNameNoExt(const std::string &filepath);
+
+#define getVmValues(vmsize, vmpeak, vmrss, vmhwm) vmsize = (long) getVmSizeInKB(); \
+ vmpeak = (long) getVmPeakInKB(); \
+ vmrss = (long) getVmRSSInKB(); \
+ vmhwm = (long) getVmHWMInKB();
+
+size_t getVmSizeInKB();
+size_t getVmPeakInKB();
+size_t getVmRSSInKB();
+size_t getVmHWMInKB();
+
+// Forks `numprocesses` children that each run `function(args...)` and exit.
+// The parent waits for every successfully started child and returns 0 when
+// all exited cleanly, otherwise a non-zero status from a failing child (or -1
+// when fork() itself failed).  The previous version did not check for fork()
+// failure, so a -1 pid later reached waitpid(-1, ...) which waits on *any*
+// child and corrupts the per-child accounting.
+template<typename Function, typename ... Args>
+int run_in_processes(const int &numprocesses, Function const &function, Args ... args) {
+    std::vector<pid_t> child_pids(numprocesses);
+
+    int status = 0;
+    int started = 0;
+    for (; started < numprocesses; started++) {
+        pid_t pid = fork();
+        if (pid < 0) {
+            // Stop spawning on fork failure, but still reap the children that
+            // were already started below.
+            log_err("fork() failed for process run # " << started);
+            status = -1;
+            break;
+        }
+        if (pid == 0) {
+            // Child: run the payload and terminate without returning into the
+            // parent's test flow.
+            function(args...);
+            exit(EXIT_SUCCESS);
+        }
+        child_pids[started] = pid;
+    }
+
+    for (int i = 0; i < started; i++) {
+        int _status = 0;
+        waitpid(child_pids[i], &_status, WSTOPPED);
+        if (_status) {
+            log_err("Process run # " << i << " failed with exitcode " << _status);
+            status = _status;
+        }
+    }
+    return status;
+}
+
+// Launches `numthreads` copies of `function(args...)` and blocks until every
+// one of them has finished.
+template<typename Function, typename ... Args>
+inline void run_in_threads(const int &numthreads, Function const &function, Args ... args) {
+    std::vector<std::thread> workers;
+    workers.reserve(numthreads);
+
+    for (int i = 0; i < numthreads; i++)
+        workers.emplace_back(function, args...);
+
+    for (auto &worker : workers)
+        worker.join();
+
+    workers.clear();
+}
--- /dev/null
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "MemCheckTests")
+
+file (GLOB SRC
+ ../common/*.cpp
+ ../common/ie_pipelines/*.cpp
+ *.cpp
+ tests_pipelines/*.cpp)
+
+file (GLOB HDR
+ ../common/*.h
+ ../common/ie_pipelines/*.h
+ *.h
+ tests_pipelines/*.h)
+
+# Create library file from sources.
+add_executable(${TARGET_NAME} ${HDR} ${SRC})
+
+find_package(gflags REQUIRED)
+
+target_link_libraries(${TARGET_NAME}
+ IE::gtest
+ IE::gtest_main
+ IE::pugixml
+ gflags
+ ${InferenceEngine_LIBRARIES}
+ )
+
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+ "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
+
+# Copy local configs to BIN_FOLDER
+configure_file(local_configs/test_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memcheck_tests/test_config.xml COPYONLY)
+configure_file(local_configs/env_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memcheck_tests/env_config.xml COPYONLY)
+configure_file(local_configs/references_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memcheck_tests/references_config.xml COPYONLY)
--- /dev/null
+#pragma once
+
+#include "../common/utils.h"
+
+#include <gflags/gflags.h>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief message for test_config argument
+static const char test_conf_message[] = "Optional. Path to a test config with description about number of threads, iterations etc.";
+
+/// @brief Define parameter for set test's configuration <br>
+/// test_conf is an optional parameter
+DEFINE_string(test_conf, OS_PATH_JOIN({"stress_tests_configs", "memcheck_tests", "test_config.xml"}), test_conf_message);
+
+/// @brief message for env_config argument
+static const char env_conf_message[] = "Optional. Path to an env config with paths to models etc.";
+
+/// @brief Define parameter for set environment <br>
+/// env_conf is an optional parameter
+DEFINE_string(env_conf, OS_PATH_JOIN({"stress_tests_configs", "memcheck_tests", "env_config.xml"}), env_conf_message);
+
+/// @brief message for env_config argument
+static const char refs_conf_message[] = "Optional. Path to a references config with values of memory consumption per test.";
+
+/// @brief Define parameter for set references' configuration <br>
+/// refs_conf is an optional parameter
+DEFINE_string(refs_conf, OS_PATH_JOIN({"stress_tests_configs", "memcheck_tests", "references_config.xml"}), refs_conf_message);
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <irs_path>
+ <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/open_model_zoo/efd238d02035f8a5417b7b1e25cd4c997d44351f/IRs</value>
+ </irs_path>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <models>
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="create_exenetwork" device="CPU" vmsize="757218" vmpeak="901683" vmrss="73920" vmhwm="107866" />
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="create_exenetwork" device="GPU" vmsize="747815" vmpeak="860978" vmrss="401808" vmhwm="435358" />
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="infer_request_inference" device="CPU" vmsize="1001189" vmpeak="1001189" vmrss="116080" vmhwm="116080" />
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="infer_request_inference" device="GPU" vmsize="788752" vmpeak="860842" vmrss="435283" vmhwm="435283" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="create_exenetwork" device="CPU" vmsize="754806" vmpeak="803184" vmrss="15206" vmhwm="26532" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="create_exenetwork" device="GPU" vmsize="554650" vmpeak="644666" vmrss="207592" vmhwm="217720" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="infer_request_inference" device="CPU" vmsize="959257" vmpeak="959257" vmrss="26690" vmhwm="26690" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="infer_request_inference" device="GPU" vmsize="572576" vmpeak="644666" vmrss="215230" vmhwm="215230" />
+<model path="public/ssd300/FP32/ssd300.xml" test="create_exenetwork" device="CPU" vmsize="755224" vmpeak="1146142" vmrss="22246" vmhwm="370770" />
+<model path="public/ssd300/FP32/ssd300.xml" test="create_exenetwork" device="GPU" vmsize="747709" vmpeak="1031694" vmrss="401746" vmhwm="749962" />
+<model path="public/ssd300/FP32/ssd300.xml" test="infer_request_inference" device="CPU" vmsize="1343474" vmpeak="1415563" vmrss="314204" vmhwm="371131" />
+<model path="public/ssd300/FP32/ssd300.xml" test="infer_request_inference" device="GPU" vmsize="1088700" vmpeak="1160790" vmrss="739626" vmhwm="748008" />
+<model path="public/vgg16/FP32/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754050" vmpeak="2548532" vmrss="15593" vmhwm="1808765" />
+<model path="public/vgg16/FP32/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="648912" vmpeak="3289101" vmrss="299327" vmhwm="3003457" />
+<model path="public/vgg16/FP32/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2257006" vmpeak="2548532" vmrss="1243448" vmhwm="1809143" />
+<model path="public/vgg16/FP32/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2413290" vmpeak="3289101" vmrss="2059780" vmhwm="3006845" />
+ </models>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>public/vgg16/FP32/vgg16.xml</value>
+ <value>public/mtcnn-r/FP32/mtcnn-r.xml</value>
+ <value>public/mobilenet-ssd/FP32/mobilenet-ssd.xml</value>
+ <value>public/ssd300/FP32/ssd300.xml</value>
+ </models>
+</attributes>
--- /dev/null
+#include "flags.h"
+#include "../common/utils.h"
+#include <tests_utils.h>
+#include "../common/tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+
+
+// Parses gflags options and verifies that every configured XML file loads.
+// Arguments starting with "--gtest" are filtered out first so gflags does not
+// reject options owned by googletest.  Returns false on --help/-h or when any
+// config file fails to parse.
+bool parseAndCheckCommandLine(int argc, char **argv) {
+    // ---------------------------Parsing and validating input arguments--------------------------------------
+    log_info("Parsing input parameters");
+
+    int new_argc = 0;
+    std::vector<char*> _argv;
+    for (int i = 0; i < argc; i++) {
+        if ("--gtest" != std::string(argv[i]).substr(0, 7)) {
+            _argv.push_back(argv[i]);
+            new_argc++;
+        }
+    }
+    // gflags rewrites the argument array in place; hand it the filtered copy.
+    char **new_argv = &_argv[0];
+    gflags::ParseCommandLineNonHelpFlags(&new_argc, &new_argv, true);
+
+    if (FLAGS_help || FLAGS_h) {
+        // TODO print info
+        //::testing::InitGoogleTest(&argc, argv);
+        return false;
+    }
+
+    // Each config is loaded here only to fail fast with a readable error;
+    // main() re-loads them into the environment singletons afterwards.
+    pugi::xml_document config;
+    pugi::xml_parse_result result = config.load_file(FLAGS_test_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading test config \"" << FLAGS_test_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_env_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading env config \"" << FLAGS_env_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_refs_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading references config \"" << FLAGS_refs_conf << "\": " << result.description());
+        return false;
+    }
+    return true;
+}
+
+
+// Loads the three XML configs into the environment singletons, then runs all
+// registered gtest cases.
+int main(int argc, char **argv) {
+    if (!parseAndCheckCommandLine(argc, argv)) {
+        return 0; // TODO return correct status
+    }
+
+    // The same document object is reused; each set*Config call deep-copies it.
+    pugi::xml_document config;
+    config.load_file(FLAGS_test_conf.c_str());
+    Environment::Instance().setTestConfig(config);
+    config.load_file(FLAGS_env_conf.c_str());
+    Environment::Instance().setEnvConfig(config);
+    config.load_file(FLAGS_refs_conf.c_str());
+    MemCheckEnvironment::Instance().setRefsConfig(config);
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
--- /dev/null
+#include "tests_utils.h"
+#include "../common/tests_utils.h"
+#include "../common/managers/thread_manager.h"
+#include "tests_pipelines/tests_pipelines.h"
+
+#include <gtest/gtest.h>
+
+// Fails the test when any reference value is missing (still at its -1
+// default), i.e. no matching record was found in the references config.
+// The previous version checked ref_vmsize and ref_vmrss twice and never
+// validated ref_vmpeak / ref_vmhwm despite the messages naming them.
+#define checkRefVmValues() \
+    ASSERT_GT(test_refs.ref_vmsize, 0) << "Reference value of VmSize is less than 0. Value: " << test_refs.ref_vmsize; \
+    ASSERT_GT(test_refs.ref_vmpeak, 0) << "Reference value of VmPeak is less than 0. Value: " << test_refs.ref_vmpeak; \
+    ASSERT_GT(test_refs.ref_vmrss, 0) << "Reference value of VmRSS is less than 0. Value: " << test_refs.ref_vmrss; \
+    ASSERT_GT(test_refs.ref_vmhwm, 0) << "Reference value of VmHWM is less than 0. Value: " << test_refs.ref_vmhwm;
+
+// Parameterized suite: one instance per (device, model) pair generated below.
+class MemCheckTestSuite : public ::testing::TestWithParam<TestCase> {
+};
+
+// tests_pipelines/tests_pipelines.cpp
+TEST_P(MemCheckTestSuite, create_exenetwork) {
+    std::string test_name = "create_exenetwork";
+    auto test_params = GetParam();
+
+    // Look up reference memory values for this (test, model, device) triple.
+    TestReferences test_refs;
+    test_refs.collect_vm_values_for_test(test_name, test_params);
+
+    // Aborts the test when no reference record was found (values still -1).
+    checkRefVmValues();
+
+    TestResult res = test_create_exenetwork(test_params.model_name, test_params.model, test_params.device,
+                                            test_refs.ref_vmsize, test_refs.ref_vmpeak, test_refs.ref_vmrss,
+                                            test_refs.ref_vmhwm);
+    EXPECT_EQ(res.first, TestStatus::TEST_OK) << res.second;
+}
+
+TEST_P(MemCheckTestSuite, infer_request_inference) {
+    std::string test_name = "infer_request_inference";
+    auto test_params = GetParam();
+
+    TestReferences test_refs;
+    test_refs.collect_vm_values_for_test(test_name, test_params);
+
+    checkRefVmValues();
+
+    TestResult res = test_infer_request_inference(test_params.model_name, test_params.model, test_params.device,
+                                                  test_refs.ref_vmsize, test_refs.ref_vmpeak, test_refs.ref_vmrss,
+                                                  test_refs.ref_vmhwm);
+    EXPECT_EQ(res.first, TestStatus::TEST_OK) << res.second;
+}
+// tests_pipelines/tests_pipelines.cpp
+
+// Cartesian product over the "devices" and "models" lists from the test
+// config; process/thread/iteration counts take their defaults.
+INSTANTIATE_TEST_CASE_P(MemCheckTests, MemCheckTestSuite,
+                        ::testing::ValuesIn(
+                                generateTestsParams({"devices", "models"})),
+                        getTestCaseName);
--- /dev/null
+#include "tests_pipelines.h"
+
+#include <string>
+#include <math.h>
+#include <chrono>
+
+#include <inference_engine.hpp>
+
+#define REPORTING_THRESHOLD 1.1
+
+using namespace InferenceEngine;
+
+// Reads the current VM counters into the first four arguments and subtracts
+// the pre-test baselines so the values reflect the test body's consumption
+// only.  VmPeak is aligned against the VmSize baseline and VmHWM against the
+// VmRSS baseline, since both peaks start from the pre-test level.
+// The previous version ignored all six parameters and hard-coded the caller's
+// variable names; this expansion is identical for the existing call sites but
+// now actually uses its arguments.
+#define getAlignedVmValues(vmsize, vmpeak, vmrss, vmhwm, vmsize_to_align, vmrss_to_align) \
+    getVmValues(vmsize, vmpeak, vmrss, vmhwm);  \
+    vmsize -= vmsize_to_align;                  \
+    vmpeak -= vmsize_to_align;                  \
+    vmrss -= vmrss_to_align;                    \
+    vmhwm -= vmrss_to_align;
+
+#define log_debug_ref_record_for_test(test_name) \
+ log_debug("Record to update reference config: " \
+ << "<model path=\"" + model_name + "\"" + " test=\"" + test_name + "\" device=\"" + \
+ target_device + \
+ "\" vmsize=\"" + std::to_string((int) (test_cur_vmsize * REPORTING_THRESHOLD)) + \
+ "\" vmpeak=\"" + std::to_string((int) (test_cur_vmpeak * REPORTING_THRESHOLD)) + \
+ "\" vmrss=\"" + std::to_string((int) (test_cur_vmrss * REPORTING_THRESHOLD)) + \
+ "\" vmhwm=\"" + std::to_string((int) (test_cur_vmhwm * REPORTING_THRESHOLD)) + "\" />");
+
+#define log_info_ref_mem_usage() \
+ log_info("Reference values of virtual memory consumption:"); \
+ log_info("VMRSS\t\tVMHWM\t\tVMSIZE\t\tVMPEAK"); \
+ log_info(ref_vmrss << "\t\t" << ref_vmhwm << "\t\t" << ref_vmsize << "\t\t" << ref_vmpeak);
+
+#define log_info_cur_mem_usage() \
+ log_info("Current values of virtual memory consumption:"); \
+ log_info("VMRSS\t\tVMHWM\t\tVMSIZE\t\tVMPEAK"); \
+ log_info(test_cur_vmrss << "\t\t" << test_cur_vmhwm << "\t\t" << test_cur_vmsize << "\t\t" << test_cur_vmpeak);
+
+// Measures memory growth caused by building an ExecutableNetwork from
+// `model_path` on `target_device` and compares the HWM (peak RSS) delta
+// against the reference.  Only ref_vmhwm is enforced; the other refs are
+// logged for reporting.  Returns TEST_OK or TEST_FAILED with a message.
+TestResult
+test_create_exenetwork(const std::string &model_name, const std::string &model_path, const std::string &target_device,
+                       const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss, const long &ref_vmhwm) {
+    log_info("Create ExecutableNetwork from network: \"" << model_path
+                                                         << "\" for device: \"" << target_device << "\"");
+    long vmsize_before_test = 0, vmrss_before_test = 0,
+            test_cur_vmsize = 0, test_cur_vmpeak = 0,
+            test_cur_vmrss = 0, test_cur_vmhwm = 0;
+
+    // Baseline taken before any Inference Engine objects are created.
+    vmsize_before_test = (long) getVmSizeInKB();
+    vmrss_before_test = (long) getVmRSSInKB();
+
+    // Pipeline factory returns a callable; invoke it immediately.
+    create_exenetwork(model_path, target_device)();
+
+    getAlignedVmValues(test_cur_vmsize, test_cur_vmpeak, test_cur_vmrss, test_cur_vmhwm,
+                       vmsize_before_test, vmrss_before_test);
+
+    // Emit a ready-to-paste reference record (values padded by the threshold).
+    log_debug_ref_record_for_test("create_exenetwork");
+    log_info_ref_mem_usage();
+    log_info_cur_mem_usage();
+
+    if (test_cur_vmhwm > ref_vmhwm)
+        return TestResult(TestStatus::TEST_FAILED,
+                          "Test failed: HWM (peak of RSS) virtual memory consumption is greater than reference.\n"
+                          "Reference HWM of memory consumption: " + std::to_string(ref_vmhwm) + " KB.\n" +
+                          "Current HWM of memory consumption: " + std::to_string(test_cur_vmhwm) + " KB.\n");
+
+    return TestResult(TestStatus::TEST_OK, "");
+}
+
+// Runs inference repeatedly for ~5 seconds and fails as soon as the RSS
+// delta (relative to the pre-IE baseline) exceeds the reference, which would
+// indicate a leak during inference.  Returns TEST_OK or TEST_FAILED.
+TestResult
+test_infer_request_inference(const std::string &model_name, const std::string &model_path,
+                             const std::string &target_device,
+                             const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss,
+                             const long &ref_vmhwm) {
+    log_info("Inference of InferRequest from network: \"" << model_path
+                                                          << "\" for device: \"" << target_device << "\"");
+    long vmsize_before_test = 0, vmrss_before_test = 0,
+            test_cur_vmsize = 0, test_cur_vmpeak = 0,
+            test_cur_vmrss = 0, test_cur_vmhwm = 0;
+    std::chrono::system_clock::time_point t_start, t_end;
+    std::chrono::duration<double> t_diff;
+
+    // Baseline taken before any Inference Engine objects are created.
+    vmsize_before_test = (long) getVmSizeInKB();
+    vmrss_before_test = (long) getVmRSSInKB();
+
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model_path);
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+
+    log_info_ref_mem_usage();
+
+    t_start = std::chrono::system_clock::now();
+    int seconds = 1;  // next one-second mark at which progress is logged
+    do {
+        infer_request.Infer();
+        // Touch every output blob so results are actually materialized.
+        OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+        for (auto &output : output_info)
+            Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+        t_end = std::chrono::system_clock::now();
+        t_diff = t_end - t_start;
+
+        getAlignedVmValues(test_cur_vmsize, test_cur_vmpeak, test_cur_vmrss, test_cur_vmhwm,
+                           vmsize_before_test, vmrss_before_test);
+
+        // Check after every iteration so the failure reports how long it took.
+        if (test_cur_vmrss > ref_vmrss) {
+            log_debug_ref_record_for_test("infer_request_inference");
+            return TestResult(TestStatus::TEST_FAILED,
+                              "Test failed: RSS virtual memory consumption became greater than reference "
+                              "after " + std::to_string(t_diff.count()) + " sec of inference.\n"
+                              "Reference RSS memory consumption: " + std::to_string(ref_vmrss) + " KB.\n" +
+                              "Current RSS memory consumption: " + std::to_string(test_cur_vmrss) + " KB.\n");
+        }
+
+        if (t_diff.count() > (double) (seconds)) {
+            log_info("Current values of virtual memory consumption after " << seconds << " seconds:");
+            log_info("VMRSS\t\tVMHWM\t\tVMSIZE\t\tVMPEAK");
+            log_info(test_cur_vmrss << "\t\t" << test_cur_vmhwm << "\t\t" << test_cur_vmsize << "\t\t" << test_cur_vmpeak);
+            seconds++;
+        }
+    } while (t_diff.count() < 5);
+    log_debug_ref_record_for_test("infer_request_inference");
+
+    return TestResult(TestStatus::TEST_OK, "");
+}
--- /dev/null
+#pragma once
+
+#include "../../common/tests_utils.h"
+#include "../../common/utils.h"
+#include "../../common/ie_pipelines/pipelines.h"
+
+#include <string>
+
+// tests_pipelines/tests_pipelines.cpp
+TestResult test_create_exenetwork(const std::string &model_name, const std::string &model_path, const std::string &target_device,
+ const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss, const long &ref_vmhwm);
+TestResult test_infer_request_inference(const std::string &model_name, const std::string &model_path, const std::string &target_device,
+ const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss, const long &ref_vmhwm);
+// tests_pipelines/tests_pipelines.cpp
--- /dev/null
+#include "../common/tests_utils.h"
+
+#include <pugixml.hpp>
+
+// Memcheck-specific singleton holding the parsed references config (expected
+// per-(model, test, device) memory values); populated once from main().
+class MemCheckEnvironment {
+private:
+    pugi::xml_document _refs_config;
+    MemCheckEnvironment() = default;
+    MemCheckEnvironment(const MemCheckEnvironment&) = delete;
+    MemCheckEnvironment& operator=(const MemCheckEnvironment&) = delete;
+public:
+    // Meyers singleton: constructed on first use, one instance per process.
+    static MemCheckEnvironment& Instance(){
+        static MemCheckEnvironment env;
+        return env;
+    }
+
+    const pugi::xml_document & getRefsConfig() {
+        return _refs_config;
+    }
+
+    // Deep-copies the given document into the singleton.
+    void setRefsConfig(const pugi::xml_document &refs_config) {
+        _refs_config.reset(refs_config);
+    }
+};
+
+// Parses the references config into parallel per-record vectors and exposes
+// the reference memory values for one (test, model, device) triple via the
+// ref_* members, which stay -1 when no matching record exists.
+class TestReferences {
+private:
+    std::vector<std::string> model_path_v, test_name_v, device_v;
+    std::vector<long> vmsize_v, vmpeak_v, vmrss_v, vmhwm_v;
+public:
+    long ref_vmsize = -1, ref_vmpeak = -1, ref_vmrss = -1, ref_vmhwm = -1;
+
+    TestReferences() {
+        // Parse RefsConfig from MemCheckEnvironment
+        std::string models_path = Environment::Instance().getEnvConfig()
+                .child("attributes").child("irs_path").child("value").text().as_string();
+
+        const pugi::xml_document &refs_config = MemCheckEnvironment::Instance().getRefsConfig();
+        auto values = refs_config.child("attributes").child("models");
+        for (pugi::xml_node node = values.first_child(); node; node = node.next_sibling()) {
+            for (pugi::xml_attribute_iterator ait = node.attributes_begin(); ait != node.attributes_end(); ait++) {
+                // Exact attribute-name match.  The previous strncmp with
+                // strlen(ait->name()) was a prefix match and would have
+                // accepted truncated names such as "pa" for "path".
+                const std::string attr_name = ait->name();
+                if (attr_name == "path") {
+                    model_path_v.push_back(OS_PATH_JOIN({models_path, ait->value()}));
+                } else if (attr_name == "test") {
+                    test_name_v.push_back(ait->value());
+                } else if (attr_name == "device") {
+                    device_v.push_back(ait->value());
+                } else if (attr_name == "vmsize") {
+                    vmsize_v.push_back(std::atol(ait->value()));
+                } else if (attr_name == "vmpeak") {
+                    vmpeak_v.push_back(std::atol(ait->value()));
+                } else if (attr_name == "vmrss") {
+                    vmrss_v.push_back(std::atol(ait->value()));
+                } else if (attr_name == "vmhwm") {
+                    vmhwm_v.push_back(std::atol(ait->value()));
+                }
+            }
+        }
+    }
+
+    // Copies the reference values matching (test_name, model, device) into the
+    // ref_* members; they remain -1 when no record matches.
+    void collect_vm_values_for_test(const std::string &test_name, const TestCase &test_params) {
+        for (size_t i = 0; i < test_name_v.size(); i++) {
+            if (test_name_v[i] == test_name &&
+                model_path_v[i] == test_params.model &&
+                device_v[i] == test_params.device) {
+                ref_vmsize = vmsize_v[i];
+                ref_vmpeak = vmpeak_v[i];
+                ref_vmrss = vmrss_v[i];
+                ref_vmhwm = vmhwm_v[i];
+                break;
+            }
+        }
+    }
+};
\ No newline at end of file
--- /dev/null
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "StressMemLeaksTests")
+
+file (GLOB SRC
+ ../common/*.cpp
+ ../common/ie_pipelines/*.cpp
+ *.cpp
+ tests_pipelines/*.cpp)
+
+file (GLOB HDR
+ ../common/*.h
+ ../common/managers/*.h
+ ../common/ie_pipelines/*.h
+ *.h
+ tests_pipelines/*.h)
+
+# Create library file from sources.
+add_executable(${TARGET_NAME} ${HDR} ${SRC})
+
+find_package(gflags REQUIRED)
+find_package(Threads REQUIRED)
+
+target_link_libraries(${TARGET_NAME}
+ IE::gtest
+ IE::gtest_main
+ IE::pugixml
+ gflags
+ Threads::Threads
+ ${InferenceEngine_LIBRARIES}
+ )
+
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+ "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
+
+# Copy local configs to BIN_FOLDER
+configure_file(local_configs/test_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memleaks_tests/test_config.xml COPYONLY)
+configure_file(local_configs/env_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memleaks_tests/env_config.xml COPYONLY)
--- /dev/null
+#pragma once
+
+#include "../common/utils.h"
+
+#include <gflags/gflags.h>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief message for test_config argument
+static const char test_conf_message[] = "Optional. Path to a test config with description about number of threads, iterations etc.";
+
+/// @brief Define parameter for set test's configuration <br>
+/// test_conf is an optional parameter
+DEFINE_string(test_conf, OS_PATH_JOIN({"stress_tests_configs", "memleaks_tests", "test_config.xml"}), test_conf_message);
+
+/// @brief message for env_config argument
+static const char env_conf_message[] = "Optional. Path to an env config with paths to models etc.";
+
+/// @brief Define parameter for set environment <br>
+/// env_conf is an optional parameter
+DEFINE_string(env_conf, OS_PATH_JOIN({"stress_tests_configs", "memleaks_tests", "env_config.xml"}), env_conf_message);
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <irs_path>
+ <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/master_04d6f112132f92cab563ae7655747e0359687dc9/</value>
+ </irs_path>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+<!--[ WARNING ] Use of attribute "processes" from config isn't implemented yet. It will be ignored.-->
+ <processes>
+ <value>1</value>
+ </processes>
+ <threads>
+ <value>1</value>
+ </threads>
+ <iterations>
+ <value>30</value>
+ </iterations>
+ <devices>
+ <value>CPU</value>
+<!-- <value>GPU</value>-->
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ </models>
+</attributes>
--- /dev/null
+#include "flags.h"
+#include "../common/utils.h"
+#include "../common/tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+
+
+// Parses gflags options and verifies that both configured XML files load.
+// Arguments starting with "--gtest" are filtered out first so gflags does not
+// reject options owned by googletest.  Returns false on --help/-h or when a
+// config file fails to parse.
+bool parseAndCheckCommandLine(int argc, char **argv) {
+    // ---------------------------Parsing and validating input arguments--------------------------------------
+    log_info("Parsing input parameters");
+
+    int new_argc = 0;
+    std::vector<char*> _argv;
+    for (int i = 0; i < argc; i++) {
+        if ("--gtest" != std::string(argv[i]).substr(0, 7)) {
+            _argv.push_back(argv[i]);
+            new_argc++;
+        }
+    }
+    // gflags rewrites the argument array in place; hand it the filtered copy.
+    char **new_argv = &_argv[0];
+    gflags::ParseCommandLineNonHelpFlags(&new_argc, &new_argv, true);
+
+    if (FLAGS_help || FLAGS_h) {
+        // TODO print info
+        //::testing::InitGoogleTest(&argc, argv);
+        return false;
+    }
+
+    // Each config is loaded here only to fail fast with a readable error;
+    // main() re-loads them into the environment singleton afterwards.
+    pugi::xml_document config;
+    pugi::xml_parse_result result = config.load_file(FLAGS_test_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading test config \"" << FLAGS_test_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_env_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading env config \"" << FLAGS_env_conf << "\": " << result.description());
+        return false;
+    }
+    return true;
+}
+
+
+// Warns about configuration attributes with incomplete support, loads both
+// XML configs into the environment singleton, then runs all gtest cases.
+int main(int argc, char **argv) {
+    log_warn("Use of attribute \"processes\" from config isn't implemented yet. It will be ignored.");
+    log_warn("Use of attribute \"threads\" from config greater than 1 is risky because of "
+             "no synchronization between steps from different threads. Tests results may be non-deterministic.");
+    if (!parseAndCheckCommandLine(argc, argv)) {
+        return 0; // TODO return correct status
+    }
+
+    // The same document object is reused; each set*Config call deep-copies it.
+    pugi::xml_document config;
+    config.load_file(FLAGS_test_conf.c_str());
+    Environment::Instance().setTestConfig(config);
+    config.load_file(FLAGS_env_conf.c_str());
+    Environment::Instance().setEnvConfig(config);
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
--- /dev/null
+#include "../common/tests_utils.h"
+#include "../common/managers/thread_manager.h"
+#include "tests_pipelines/tests_pipelines.h"
+
+#include <inference_engine.hpp>
+
+#include <gtest/gtest.h>
+
+using namespace InferenceEngine;
+
+class MemLeaksTestSuiteNoModel : public ::testing::TestWithParam<TestCase> {
+};
+
+class MemLeaksTestSuiteNoDevice : public ::testing::TestWithParam<TestCase> {
+};
+
+class MemLeaksTestSuite : public ::testing::TestWithParam<TestCase> {
+};
+
+// Runs `test_function` in `numthreads` threads via ThreadManager and asserts
+// that every thread both finished successfully and returned TEST_OK.
+inline void test_runner(int numthreads, const std::function<TestResult()> &test_function) {
+    ThreadManager<TestResult> thr_manager;
+    for (int i = 0; i < numthreads; i++)
+        thr_manager.add_task(test_function);
+    thr_manager.run_parallel_n_wait();
+
+    std::vector<ManagerStatus> statuses = thr_manager.get_all_statuses();
+    std::vector<TestResult> results = thr_manager.get_all_results();
+
+    // EXPECT (not ASSERT) so every thread's outcome is reported, not just the
+    // first failure.
+    for (int i = 0; i < numthreads; i++) {
+        EXPECT_EQ(statuses[i], ManagerStatus::FINISHED_SUCCESSFULLY)
+            << "[Thread " << i << "] Thread not finished successfully";
+        EXPECT_EQ(results[i].first, TestStatus::TEST_OK) << "[Thread " << i << "] " << results[i].second;
+    }
+}
+
+
+// tests_pipelines/tests_pipelines.cpp
+TEST_P(MemLeaksTestSuiteNoModel, load_unload_plugin) {
+ auto test_params = GetParam();
+ auto test = [&] {
+ return test_load_unload_plugin(test_params.device, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, read_network) {
+ auto test_params = GetParam();
+ auto test = [&] {
+ return test_read_network(test_params.model, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, create_cnnnetwork) {
+ auto test_params = GetParam();
+ auto test = [&] {
+ return test_create_cnnnetwork(test_params.model, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, cnnnetwork_reshape_batch_x2) {
+ auto test_params = GetParam();
+ auto test = [&] {
+ return test_cnnnetwork_reshape_batch_x2(test_params.model, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, set_input_params) {
+ auto test_params = GetParam();
+ auto test = [&] {
+ return test_set_input_params(test_params.model, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuite, recreate_exenetwork) {
+ auto test_params = GetParam();
+ Core ie;
+ auto test = [&] {
+ return test_recreate_exenetwork(ie, test_params.model, test_params.device, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuite, recreate_infer_request) {
+ auto test_params = GetParam();
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(test_params.model);
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, test_params.device);
+ auto test = [&] {
+ return test_recreate_infer_request(exeNetwork, test_params.model, test_params.device, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuite, reinfer_request_inference) {
+ auto test_params = GetParam();
+ auto test = [&] {
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(test_params.model);
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, test_params.device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ return test_reinfer_request_inference(infer_request, cnnNetwork, test_params.model, test_params.device, test_params.numiters);
+ };
+ test_runner(test_params.numthreads, test);
+}
+// tests_pipelines/tests_pipelines.cpp
+
+INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuiteNoModel,
+ ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "devices"})),
+ getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuiteNoDevice,
+ ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "models"})),
+ getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuite,
+ ::testing::ValuesIn(
+ generateTestsParams({"processes", "threads", "iterations", "devices", "models"})),
+ getTestCaseName);
+
--- /dev/null
+#include "tests_pipelines.h"
+
+#include <math.h>
+
+#include <inference_engine.hpp>
+#include <algorithm>
+#include <array>
+#include <string>
+
+using namespace InferenceEngine;
+
+// Number of pipeline runs before it starts measuring
+#define WARMUP_STEPS 30
+// Number of memory peaks ignored. The libc memory manager can produce peaks with
+// overall flat consumption
+#define MAX_OUTLIERS 5
+// Maximum number of measuring pipeline restarts
+#define MAX_RETRY 3
+// A threshold for which memory growth will be considered an error
+#define THRESHOLD 0.1
+
+// Measure values
+enum MeasureValue { VMRSS = 0, VMHWM, VMSIZE, VMPEAK, MeasureValueMax };
+
+namespace util {
+template <typename In, typename Out, typename Func>
+void transform(const In& in, Out& out, const Func& func) {
+ std::transform(std::begin(in), std::end(in), std::begin(out), func);
+}
+
+template <typename In1, typename In2, typename Out, typename Func>
+void transform(const In1& in1, const In2& in2, Out& out, const Func& func) {
+ std::transform(std::begin(in1), std::end(in1), std::begin(in2), std::begin(out), func);
+}
+} // namespace util
+
+TestResult common_test_pipeline(const std::function<void()>& test_pipeline, const int& n) {
+ int retry_count = 0;
+ float mem_threshold = THRESHOLD;
+ std::array<long, MeasureValueMax> cur = {0}; // measured for current iteration
+ std::array<long, MeasureValueMax> ref = {0}; // recorded reference
+ std::array<long, MeasureValueMax> diff = {0}; // difference between current and reference
+ std::array<bool, MeasureValueMax> outlier = {0}; // flag if current does not fit threshold
+ std::array<int, MeasureValueMax> outlier_count = {0}; // counter for how many times current does not fit threshold
+ std::array<float, MeasureValueMax> threshold = {0}; // ref * THRESHOLD
+ std::string progress_str;
+
+ progress_str.reserve(1024);
+
+ log_info("Warming up for " << WARMUP_STEPS << " iterations");
+ log_info("i\tVMRSS\tVMHWM\tVMSIZE\tVMPEAK");
+ int measure_count = n;
+ for (int iteration = 0; measure_count > 0; iteration++) {
+ // Warm up to take reference values
+ test_pipeline();
+ getVmValues(cur[VMSIZE], cur[VMPEAK], cur[VMRSS], cur[VMHWM]);
+ progress_str = std::to_string(iteration + 1) + "\t" + std::to_string(cur[VMRSS]) + "\t" +
+ std::to_string(cur[VMHWM]) + "\t" + std::to_string(cur[VMSIZE]) + "\t" +
+ std::to_string(cur[VMPEAK]);
+
+ // measure
+ if (iteration >= WARMUP_STEPS) {
+ // set reference
+ if (WARMUP_STEPS == iteration || (retry_count < MAX_RETRY && (outlier_count[VMRSS] > MAX_OUTLIERS ||
+ outlier_count[VMHWM] > MAX_OUTLIERS))) {
+ if (0 != retry_count) log_info("Retrying " << retry_count + 1 << " of " << MAX_RETRY);
+ retry_count++;
+ measure_count = n;
+ outlier_count = {0};
+ ref = cur;
+ util::transform(ref, threshold, [](long ref_val) -> float {
+ return THRESHOLD * ref_val;
+ });
+ log_info("Setting thresholds VMRSS=" << ref[VMRSS] << "(+-" << static_cast<int>(threshold[VMRSS])
+ << "), VMHWM=" << ref[VMHWM] << "(+-"
+ << static_cast<int>(threshold[VMHWM]) << ")");
+ }
+ measure_count--;
+ util::transform(cur, ref, diff, [](long cur_val, long ref_val) -> long {
+ return labs(cur_val - ref_val);
+ });
+ util::transform(diff, threshold, outlier, [](long diff_val, float threshold_val) -> bool {
+ return diff_val > threshold_val;
+ });
+ util::transform(outlier, outlier_count, outlier_count,
+ [](bool outlier_val, long outlier_count_val) -> long {
+ return outlier_count_val + (outlier_val ? 1 : 0);
+ });
+
+ if (outlier[VMRSS]) {
+ progress_str += "\t<-VMRSS outlier";
+ }
+ if (outlier[VMHWM]) {
+ progress_str += "\t<-VMHWM outlier";
+ }
+ }
+
+ log_info(progress_str);
+ }
+
+ if (outlier_count[VMRSS] > MAX_OUTLIERS)
+ return TestResult(TestStatus::TEST_FAILED, "Test failed: RSS virtual memory consumption grown too much.");
+
+ if (outlier_count[VMHWM] > MAX_OUTLIERS)
+ return TestResult(TestStatus::TEST_FAILED, "Test failed: HWM virtual memory consumption grown too much.");
+
+ return TestResult(TestStatus::TEST_OK, "");
+}
+
+
+TestResult test_load_unload_plugin(const std::string &target_device, const int &n) {
+ log_info("Load/unload plugin for device: " << target_device << " for " << n << " times");
+ return common_test_pipeline(load_unload_plugin(target_device), n);
+}
+
+TestResult test_read_network(const std::string &model, const int &n) {
+ log_info("Read network: \"" << model << "\" for " << n << " times");
+ return common_test_pipeline(read_network(model), n);
+}
+
+TestResult test_create_cnnnetwork(const std::string &model, const int &n) {
+ log_info("Create CNNNetwork from network: \"" << model << "\" for " << n << " times");
+ return common_test_pipeline(create_cnnnetwork(model), n);
+}
+
+TestResult test_cnnnetwork_reshape_batch_x2(const std::string &model, const int &n) {
+ log_info("Reshape to batch*=2 of CNNNetwork created from network: \"" << model << "\" for " << n << " times");
+ return common_test_pipeline(cnnnetwork_reshape_batch_x2(model), n);
+}
+
+TestResult test_set_input_params(const std::string &model, const int &n) {
+ log_info("Apply preprocessing for CNNNetwork from network: \"" << model << "\" for " << n << " times");
+ return common_test_pipeline(set_input_params(model), n);
+}
+
+TestResult test_create_exenetwork(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create ExecutableNetwork from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n
+ << " times");
+ return common_test_pipeline(create_exenetwork(model, target_device), n);
+}
+
+TestResult
+test_recreate_exenetwork(InferenceEngine::Core &ie, const std::string &model, const std::string &target_device,
+ const int &n) {
+ log_info("Recreate ExecutableNetwork from network within existing InferenceEngine::Core: \"" << model
+ << "\" for device: \""
+ << target_device
+ << "\" for " << n
+ << " times");
+ return common_test_pipeline(recreate_exenetwork(ie, model, target_device), n);
+}
+
+TestResult test_create_infer_request(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n
+ << " times");
+ return common_test_pipeline(create_infer_request(model, target_device), n);
+}
+
+TestResult
+test_recreate_infer_request(ExecutableNetwork &network, const std::string &model, const std::string &target_device,
+ const int &n) {
+ log_info("Create InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n
+ << " times");
+ return common_test_pipeline(recreate_infer_request(network), n);
+}
+
+TestResult
+test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Inference of InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n
+ << " times");
+ return common_test_pipeline(infer_request_inference(model, target_device), n);
+}
+
+TestResult
+test_reinfer_request_inference(InferenceEngine::InferRequest &infer_request, InferenceEngine::CNNNetwork &cnnNetwork,
+ const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Inference of InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n
+ << " times");
+ return common_test_pipeline(reinfer_request_inference(infer_request, cnnNetwork), n);
+}
--- /dev/null
+#pragma once
+
+#include "../../common/tests_utils.h"
+#include "../../common/utils.h"
+#include "../../common/ie_pipelines/pipelines.h"
+
+#include <string>
+
+#include <inference_engine.hpp>
+
+// tests_pipelines/tests_pipelines.cpp
+TestResult test_load_unload_plugin(const std::string &target_device, const int &n);
+TestResult test_read_network(const std::string &model, const int &n);
+TestResult test_create_cnnnetwork(const std::string &model, const int &n);
+TestResult test_cnnnetwork_reshape_batch_x2(const std::string &model, const int &n);
+TestResult test_set_input_params(const std::string &model, const int &n);
+TestResult test_recreate_exenetwork(InferenceEngine::Core &ie, const std::string &model, const std::string &target_device, const int &n);
+TestResult test_create_infer_request(const std::string &model, const std::string &target_device, const int &n);
+TestResult test_recreate_infer_request(InferenceEngine::ExecutableNetwork& network, const std::string &model, const std::string &target_device, const int &n);
+TestResult test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n);
+TestResult test_reinfer_request_inference(InferenceEngine::InferRequest& infer_request, InferenceEngine::CNNNetwork& cnnNetwork, const std::string &model, const std::string &target_device, const int &n);
+// tests_pipelines/tests_pipelines.cpp
--- /dev/null
+#!/usr/bin/env python3
+""" Script to acquire model IRs for stress tests.
+Usage: ./scripts/get_testdata.py
+"""
+import argparse
+import multiprocessing
+import os
+import shutil
+import subprocess
+from inspect import getsourcefile
+
+# Parameters
+MODEL_NAMES = 'vgg16,mtcnn-r,mobilenet-ssd,ssd300'
+OMZ_VERSION = 'efd238d02035f8a5417b7b1e25cd4c997d44351f'
+
+
+def abs_path(relative_path):
+ """Return absolute path given path relative to the current file.
+ """
+ return os.path.realpath(
+ os.path.join(os.path.dirname(getsourcefile(lambda: 0)), relative_path))
+
+
+def main():
+ """Main entry point.
+ """
+ parser = argparse.ArgumentParser(
+ description='Acquire test data',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ parser.add_argument('--output_dir', default=f'./_models', help='directory to put test data into')
+ parser.add_argument('--cache_dir', default=f'./_cache', help='directory with test data cache')
+ args = parser.parse_args()
+
+ # Clone Open Model Zoo into temporary path
+ omz_path = './_open_model_zoo'
+ if os.path.exists(omz_path):
+ shutil.rmtree(omz_path)
+ subprocess.check_call(
+ f'git clone https://github.com/opencv/open_model_zoo {omz_path}' \
+ f' && cd {omz_path}'\
+ f' && git checkout {OMZ_VERSION}', shell=True)
+ # Acquire model IRs
+ mo_tool = abs_path('../../../model-optimizer/mo.py')
+ subprocess.check_call(
+ f'{omz_path}/tools/downloader/downloader.py --name "{MODEL_NAMES}"' \
+ f' --output_dir {args.output_dir}/{OMZ_VERSION}/models' \
+ f' --cache_dir {args.cache_dir}', shell=True)
+ subprocess.check_call(
+ f'{omz_path}/tools/downloader/converter.py --name "{MODEL_NAMES}"' \
+ f' --output_dir {args.output_dir}/{OMZ_VERSION}/IRs' \
+ f' --download_dir {args.output_dir}/{OMZ_VERSION}/models' \
+ f' --mo {mo_tool} --jobs {multiprocessing.cpu_count()}', shell=True)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "StressUnitTests")
+
+file (GLOB SRC
+ ../common/*.cpp
+ ../common/ie_pipelines/*.cpp
+ *.cpp
+ tests_pipelines/*.cpp)
+
+file (GLOB HDR
+ ../common/*.h
+ ../common/managers/*.h
+ ../common/ie_pipelines/*.h
+ *.h
+ tests_pipelines/*.h)
+
+# Create library file from sources.
+add_executable(${TARGET_NAME} ${HDR} ${SRC})
+
+find_package(gflags REQUIRED)
+find_package(Threads REQUIRED)
+
+target_link_libraries(${TARGET_NAME}
+ IE::gtest
+ IE::gtest_main
+ IE::pugixml
+ gflags
+ Threads::Threads
+ ${InferenceEngine_LIBRARIES}
+ )
+
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+ "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
+
+# Copy local configs to BIN_FOLDER
+configure_file(local_configs/test_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/unittests/test_config.xml COPYONLY)
+configure_file(local_configs/env_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/unittests/env_config.xml COPYONLY)
--- /dev/null
+#pragma once
+
+#include "../common/utils.h"
+
+#include <gflags/gflags.h>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief message for test_config argument
+static const char test_conf_message[] = "Optional. Path to a test config with description about number of threads, iterations etc.";
+
+/// @brief Define parameter for set test's configuration <br>
+/// test_conf is an optional parameter
+DEFINE_string(test_conf, OS_PATH_JOIN({"stress_tests_configs", "unittests", "test_config.xml"}), test_conf_message);
+
+/// @brief message for env_config argument
+static const char env_conf_message[] = "Optional. Path to an env config with paths to models etc.";
+
+/// @brief Define parameter for set environment <br>
+/// env_conf is an optional parameter
+DEFINE_string(env_conf, OS_PATH_JOIN({"stress_tests_configs", "unittests", "env_config.xml"}), env_conf_message);
\ No newline at end of file
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <irs_path>
+ <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/master_04d6f112132f92cab563ae7655747e0359687dc9/</value>
+ </irs_path>
+</attributes>
--- /dev/null
+<?xml version="1.0"?>
+<attributes>
+ <processes>
+ <value>1</value>
+ </processes>
+ <threads>
+ <value>1</value>
+ </threads>
+ <iterations>
+ <value>100</value>
+ </iterations>
+ <devices>
+ <value>CPU</value>
+ <value>GPU</value>
+ </devices>
+ <models>
+ <value>caffe/FP32/alexnet/alexnet.xml</value>
+ </models>
+</attributes>
--- /dev/null
+#include "flags.h"
+#include "../common/utils.h"
+#include "../common/tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+
+
+bool parseAndCheckCommandLine(int argc, char **argv) {
+ // ---------------------------Parsing and validating input arguments--------------------------------------
+ log_info("Parsing input parameters");
+
+ int new_argc = 0;
+ std::vector<char*> _argv;
+ for (int i = 0; i < argc; i++) {
+ if ("--gtest" != std::string(argv[i]).substr(0, 7)) {
+ _argv.push_back(argv[i]);
+ new_argc++;
+ }
+ }
+ char **new_argv = &_argv[0];
+ gflags::ParseCommandLineNonHelpFlags(&new_argc, &new_argv, true);
+
+ if (FLAGS_help || FLAGS_h) {
+ // TODO print info
+ //::testing::InitGoogleTest(&argc, argv);
+ return false;
+ }
+
+ pugi::xml_document config;
+ pugi::xml_parse_result result = config.load_file(FLAGS_test_conf.c_str());
+ if (!result) {
+ log_err("Exception while reading test config \"" << FLAGS_test_conf << "\": " << result.description());
+ return false;
+ }
+ result = config.load_file(FLAGS_env_conf.c_str());
+ if (!result) {
+ log_err("Exception while reading env config \"" << FLAGS_env_conf << "\": " << result.description());
+ return false;
+ }
+ return true;
+}
+
+
+int main(int argc, char **argv) {
+ if (!parseAndCheckCommandLine(argc, argv)) {
+ return 0; // TODO return correct status
+ }
+
+ pugi::xml_document config;
+ config.load_file(FLAGS_test_conf.c_str());
+ Environment::Instance().setTestConfig(config);
+ config.load_file(FLAGS_env_conf.c_str());
+ Environment::Instance().setEnvConfig(config);
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
\ No newline at end of file
--- /dev/null
+#include "../common/tests_utils.h"
+#include "tests_pipelines/tests_pipelines.h"
+
+#include <gtest/gtest.h>
+
+class UnitTestSuiteNoModel : public ::testing::TestWithParam<TestCase> {
+};
+
+class UnitTestSuiteNoDevice : public ::testing::TestWithParam<TestCase> {
+};
+
+class UnitTestSuite : public ::testing::TestWithParam<TestCase> {
+};
+
+// tests_pipelines/tests_pipelines.cpp
+TEST_P(UnitTestSuiteNoModel, load_unload_plugin) {
+ runTest(test_load_unload_plugin, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, read_network) {
+ runTest(test_read_network, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, create_cnnnetwork) {
+ runTest(test_create_cnnnetwork, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, cnnnetwork_reshape_batch_x2) {
+ runTest(test_cnnnetwork_reshape_batch_x2, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, set_input_params) {
+ runTest(test_set_input_params, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_exenetwork) {
+ runTest(test_create_exenetwork, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_infer_request) {
+ runTest(test_create_infer_request, GetParam());
+}
+
+TEST_P(UnitTestSuite, infer_request_inference) {
+ runTest(test_infer_request_inference, GetParam());
+}
+// tests_pipelines/tests_pipelines.cpp
+
+
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
+TEST_P(UnitTestSuite, load_unload_plugin_full_pipeline) {
+ runTest(test_load_unload_plugin_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, read_network_full_pipeline) {
+ runTest(test_read_network_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_cnnnetwork_full_pipeline) {
+ runTest(test_create_cnnnetwork_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, set_input_params_full_pipeline) {
+ runTest(test_set_input_params_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, cnnnetwork_reshape_batch_x2_full_pipeline) {
+ runTest(test_cnnnetwork_reshape_batch_x2_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_exenetwork_full_pipeline) {
+ runTest(test_create_exenetwork_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_infer_request_full_pipeline) {
+ runTest(test_create_infer_request_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, infer_request_inference_full_pipeline) {
+ runTest(test_infer_request_inference_full_pipeline, GetParam());
+}
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
+
+INSTANTIATE_TEST_CASE_P(StressUnitTests, UnitTestSuiteNoModel,
+ ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "devices"})),
+ getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(StressUnitTests, UnitTestSuiteNoDevice,
+ ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "models"})),
+ getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(StressUnitTests, UnitTestSuite,
+ ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "devices", "models"})),
+ getTestCaseName);
--- /dev/null
+#include "tests_pipelines.h"
+
+#include <string>
+
+#include <inference_engine.hpp>
+
+
+using namespace InferenceEngine;
+
+void test_load_unload_plugin(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Load/unload plugin for device: " << target_device << " for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ load_unload_plugin(target_device)();
+ }
+}
+
+void test_read_network(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Read network: \"" << model << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ read_network(model)();
+ }
+}
+
+void test_create_cnnnetwork(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create CNNNetwork from network: \"" << model << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ create_cnnnetwork(model)();
+ }
+}
+
+void test_cnnnetwork_reshape_batch_x2(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Reshape to batch*=2 of CNNNetwork created from network: \"" << model << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ cnnnetwork_reshape_batch_x2(model)();
+ }
+}
+
+void test_set_input_params(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Apply preprocessing for CNNNetwork from network: \"" << model << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ set_input_params(model)();
+ }
+}
+
+void test_create_exenetwork(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create ExecutableNetwork from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ create_exenetwork(model, target_device)();
+ }
+}
+
+void test_create_infer_request(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ create_infer_request(model, target_device)();
+ }
+}
+
+void test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Inference of InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n << " times");
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ infer_request_inference(model, target_device)();
+ }
+}
--- /dev/null
+#pragma once
+
+#include "../../common/utils.h"
+#include "../../common/ie_pipelines/pipelines.h"
+
+#include <string>
+
+// tests_pipelines/tests_pipelines.cpp
+void test_load_unload_plugin(const std::string &model, const std::string &target_device, const int &n);
+void test_read_network(const std::string &model, const std::string &target_device, const int &n);
+void test_create_cnnnetwork(const std::string &model, const std::string &target_device, const int &n);
+void test_cnnnetwork_reshape_batch_x2(const std::string &model, const std::string &target_device, const int &n);
+void test_set_input_params(const std::string &model, const std::string &target_device, const int &n);
+void test_create_exenetwork(const std::string &model, const std::string &target_device, const int &n);
+void test_create_infer_request(const std::string &model, const std::string &target_device, const int &n);
+void test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n);
+// tests_pipelines/tests_pipelines.cpp
+
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
+void test_load_unload_plugin_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_read_network_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_create_cnnnetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_set_input_params_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_cnnnetwork_reshape_batch_x2_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_create_exenetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_create_infer_request_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_infer_request_inference_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
--- /dev/null
+#include "tests_pipelines.h"
+
+#include <string>
+
+#include <inference_engine.hpp>
+
+using namespace InferenceEngine;
+
+#define batchIndex 0
+
+#define setInputParameters() \
+ input.second->getPreProcess().setResizeAlgorithm(NO_RESIZE); \
+ input.second->setPrecision(Precision::U8); \
+ if (input.second->getInputData()->getTensorDesc().getDims().size() == 4) \
+ input.second->setLayout(Layout::NCHW); \
+ else if (input.second->getInputData()->getTensorDesc().getDims().size() == 2) \
+ input.second->setLayout(Layout::NC);
+
+#define computeShapesToReshape() \
+ auto layout = input.second->getTensorDesc().getLayout(); \
+ if ((layout == Layout::NCHW) || (layout == Layout::NC)) { \
+ shapes[input.first][batchIndex] *= 2; \
+ doReshape = true; \
+ }
+
+#define reshapeCNNNetwork() \
+ if (doReshape) \
+ cnnNetwork.reshape(shapes); \
+ else \
+ throw std::logic_error("Reshape wasn't applied for a model.");
+
+void test_load_unload_plugin_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Load/unload plugin for device: " << target_device << " for " << n << " times");
+ Core ie;
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ // GetVersions silently register plugin in `plugins` through `GetCPPPluginByName`
+ ie.GetVersions(target_device);
+ // Remove plugin for target_device from `plugins`
+ ie.UnregisterPlugin(target_device);
+ }
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_read_network_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Read network: \"" << model << "\" for " << n << " times");
+ Core ie;
+ IE_SUPPRESS_DEPRECATED_START
+ std::shared_ptr<CNNNetReader> netReaderPtr;
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ CNNNetReader netReader;
+ netReader.ReadNetwork(model);
+ netReader.ReadWeights(fileNameNoExt(model) + ".bin");
+ netReaderPtr = std::make_shared<CNNNetReader>(netReader);
+ }
+ CNNNetwork cnnNetwork = netReaderPtr->getNetwork();
+ IE_SUPPRESS_DEPRECATED_END
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_create_cnnnetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create CNNNetwork from network: \"" << model << "\" for " << n << " times");
+ Core ie;
+ CNNNetwork cnnNetwork;
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ cnnNetwork = ie.ReadNetwork(model);
+ }
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_set_input_params_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Apply preprocessing for CNNNetwork from network: \"" << model << "\" for " << n << " times");
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ }
+ }
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_cnnnetwork_reshape_batch_x2_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Reshape to batch*=2 of CNNNetwork created from network: \"" << model << "\" for " << n << " times");
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ }
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ int prev_batch = -1, new_batch;
+ for (auto &input : inputInfo) {
+ auto layout = input.second->getTensorDesc().getLayout();
+ if ((layout == Layout::NCHW) || (layout == Layout::NC))
+ prev_batch = shapes[input.first][batchIndex];
+ }
+ if (prev_batch == -1)
+ throw std::logic_error("Reshape wasn't applied for a model.");
+
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+
+ new_batch = ((i % 2) == 0) ? prev_batch * 2 : prev_batch;
+ for (auto &input : inputInfo) {
+ auto layout = input.second->getTensorDesc().getLayout();
+ if ((layout == Layout::NCHW) || (layout == Layout::NC)) {
+ shapes[input.first][batchIndex] = new_batch;
+ doReshape = true;
+ }
+ }
+ reshapeCNNNetwork();
+ }
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_create_exenetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create ExecutableNetwork from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n << " times");
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork;
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ }
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_create_infer_request_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Create InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n << " times");
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request;
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ infer_request = exeNetwork.CreateInferRequest();
+ }
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_infer_request_inference_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+ log_info("Inference of InferRequest from network: \"" << model
+ << "\" for device: \"" << target_device << "\" for " << n << " times");
+ Core ie;
+ CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+ InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+ ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+ bool doReshape = false;
+ for (auto &input : inputInfo) {
+ setInputParameters();
+ computeShapesToReshape();
+ }
+ reshapeCNNNetwork();
+ ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+ InferRequest infer_request = exeNetwork.CreateInferRequest();
+ for (int i = 0; i < n; i++) {
+ if (i == n / 2) {
+ log_info("Half of the test have already passed");
+ }
+ infer_request.Infer();
+ OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+ for (auto &output : output_info)
+ Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+ }
+}
Default value is determined automatically for a device.
Please note that although the automatic selection usually provides a reasonable performance,
it still may be non-optimal for some cases, especially for very small networks.
+ -enforcebf16 [ENFORCE_BFLOAT16], --enforce_bfloat16 [ENFORCE_BFLOAT16]
+ Optional. Enforcing of floating point operations
+ execution in bfloat16 precision where it is acceptable.
-nthreads NUMBER_THREADS, --number_threads NUMBER_THREADS
Number of threads to use for inference on the CPU
(including HETERO and MULTI cases).
- -pin {YES,NO}, --infer_threads_pinning {YES,NO}
- Optional. Enable ("YES" is default value) or disable
- ("NO")CPU threads pinning for CPU-involved inference.
+ -pin {YES,NO,NUMA}, --infer_threads_pinning {YES,NO,NUMA}
+ Optional. Enable threads->cores ('YES' is default
+ value), threads->(NUMA)nodes ('NUMA') or completely
+ disable ('NO')CPU threads pinning for CPU-involved
+ inference.
--exec_graph_path EXEC_GRAPH_PATH
Optional. Path to a file where to store executable
graph information serialized.
-pc [PERF_COUNTS], --perf_counts [PERF_COUNTS]
Optional. Report performance counters.
-
+ -dump_config DUMP_CONFIG
+ Optional. Path to JSON file to dump IE parameters,
+ which were set by application.
+ -load_config LOAD_CONFIG
+ Optional. Path to JSON file to load custom IE
+ parameters. Please note, command line parameters have
+ higher priority then parameters from configuration
+ file.
```
Running the application with the empty list of options yields the usage message given above and an error message.
from statistics import median
from openvino.inference_engine import IENetwork, IECore, get_version, StatusCode
-from .utils.constants import CPU_DEVICE_NAME, MULTI_DEVICE_NAME, GPU_DEVICE_NAME, MYRIAD_DEVICE_NAME, BIN_EXTENSION
+from .utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, BIN_EXTENSION
from .utils.logging import logger
-from .utils.utils import get_duration_seconds, parse_nstreams_value_per_device, parse_devices
+from .utils.utils import get_duration_seconds
from .utils.inputs_filling import get_blob_shape
-
+from .utils.statistics_report import StatisticsReport
class Benchmark:
- def __init__(self, device: str, number_infer_requests, number_iterations, duration_seconds, api_type):
+ def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
+ duration_seconds: int = None, api_type: str = 'async'):
self.device = device
self.ie = IECore()
self.nireq = number_infer_requests
self.niter = number_iterations
self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
self.api_type = api_type
- self.device_number_streams = {}
def __del__(self):
del self.ie
def add_extension(self, path_to_extension: str=None, path_to_cldnn_config: str=None):
- if GPU_DEVICE_NAME in self.device:
- if path_to_cldnn_config:
- self.ie.set_config({'CONFIG_FILE': path_to_cldnn_config}, GPU_DEVICE_NAME)
- logger.info('GPU extensions is loaded {}'.format(path_to_cldnn_config))
- if CPU_DEVICE_NAME in self.device or MYRIAD_DEVICE_NAME in self.device:
- if path_to_extension:
- self.ie.add_extension(extension_path=path_to_extension, device_name=CPU_DEVICE_NAME)
- logger.info('CPU extensions is loaded {}'.format(path_to_extension))
+ if path_to_cldnn_config:
+ self.ie.set_config({'CONFIG_FILE': path_to_cldnn_config}, GPU_DEVICE_NAME)
+ logger.info('GPU extensions is loaded {}'.format(path_to_cldnn_config))
+
+ if path_to_extension:
+ self.ie.add_extension(extension_path=path_to_extension, device_name=CPU_DEVICE_NAME)
+ logger.info('CPU extensions is loaded {}'.format(path_to_extension))
def get_version_info(self) -> str:
logger.info('InferenceEngine:\n{: <9}{:.<24} {}'.format('', 'API version', get_version()))
logger.info('Resizing network to batch = {}'.format(batch_size))
ie_network.reshape(new_shapes)
- def set_config(self, number_streams: int, api_type: str = 'async',
- number_threads: int = None, infer_threads_pinning: int = None):
- devices = parse_devices(self.device)
- self.device_number_streams = parse_nstreams_value_per_device(devices, number_streams)
- for device_name in self.device_number_streams.keys():
- key = device_name + "_THROUGHPUT_STREAMS"
- supported_config_keys = self.ie.get_metric(device_name, 'SUPPORTED_CONFIG_KEYS')
- if key not in supported_config_keys:
- raise Exception("Device " + device_name + " doesn't support config key '" + key + "'! " +
- "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>");
-
- for device in devices:
- if device == CPU_DEVICE_NAME: # CPU supports few special performance-oriented keys
- # limit threading for CPU portion of inference
- if number_threads:
- self.ie.set_config({'CPU_THREADS_NUM': str(number_threads)}, device)
-
- if MULTI_DEVICE_NAME in self.device and GPU_DEVICE_NAME in self.device:
- self.ie.set_config({'CPU_BIND_THREAD': 'NO'}, CPU_DEVICE_NAME)
- else:
- # pin threads for CPU portion of inference
- self.ie.set_config({'CPU_BIND_THREAD': infer_threads_pinning}, device)
-
- # for CPU execution, more throughput-oriented execution via streams
- # for pure CPU execution, more throughput-oriented execution via streams
- if api_type == 'async':
- cpu_throughput = {'CPU_THROUGHPUT_STREAMS': 'CPU_THROUGHPUT_AUTO'}
- if device in self.device_number_streams.keys():
- cpu_throughput['CPU_THROUGHPUT_STREAMS'] = str(self.device_number_streams.get(device))
- self.ie.set_config(cpu_throughput, device)
- self.device_number_streams[device] = self.ie.get_config(device, 'CPU_THROUGHPUT_STREAMS')
-
- elif device == GPU_DEVICE_NAME:
- if api_type == 'async':
- gpu_throughput = {'GPU_THROUGHPUT_STREAMS': 'GPU_THROUGHPUT_AUTO'}
- if device in self.device_number_streams.keys():
- gpu_throughput['GPU_THROUGHPUT_STREAMS'] = str(self.device_number_streams.get(device))
- self.ie.set_config(gpu_throughput, device)
- self.device_number_streams[device] = self.ie.get_config(device, 'GPU_THROUGHPUT_STREAMS')
-
- if MULTI_DEVICE_NAME in self.device and CPU_DEVICE_NAME in self.device:
- # multi-device execution with the CPU+GPU performs best with GPU trottling hint,
- # which releases another CPU thread (that is otherwise used by the GPU driver for active polling)
- self.ie.set_config({'CLDNN_PLUGIN_THROTTLE': '1'}, device)
-
- elif device == MYRIAD_DEVICE_NAME:
- self.ie.set_config({'LOG_LEVEL': 'LOG_INFO'}, MYRIAD_DEVICE_NAME)
+ def set_config(self, config = {}):
+ for device in config.keys():
+ self.ie.set_config(config[device], device)
def read_network(self, path_to_model: str):
xml_filename = os.path.abspath(path_to_model)
- head, tail = os.path.splitext(xml_filename)
+ head, _ = os.path.splitext(xml_filename)
bin_filename = os.path.abspath(head + BIN_EXTENSION)
ie_network = self.ie.read_network(xml_filename, bin_filename)
return ie_network
- def load_network(self, ie_network: IENetwork, perf_counts: bool):
- config = {'PERF_COUNT': ('YES' if perf_counts else 'NO')}
-
+ def load_network(self, ie_network: IENetwork, config = {}):
exe_network = self.ie.load_network(ie_network,
self.device,
config=config,
num_requests=1 if self.api_type == 'sync' else self.nireq or 0)
# Number of requests
self.nireq = len(exe_network.requests)
+
return exe_network
def infer(self, exe_network, batch_size, progress_bar=None):
from openvino.tools.benchmark.benchmark import Benchmark
from openvino.tools.benchmark.parameters import parse_args
-from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME
+from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, MYRIAD_DEVICE_NAME, BIN_EXTENSION
from openvino.tools.benchmark.utils.inputs_filling import set_inputs
from openvino.tools.benchmark.utils.logging import logger
from openvino.tools.benchmark.utils.progress_bar import ProgressBar
from openvino.tools.benchmark.utils.utils import next_step, config_network_inputs, get_number_iterations, \
process_help_inference_string, print_perf_counters, dump_exec_graph, get_duration_in_milliseconds, \
- get_command_line_arguments
+ get_command_line_arguments, parse_nstreams_value_per_device, parse_devices, load_config, dump_config
from openvino.tools.benchmark.utils.statistics_report import StatisticsReport, averageCntReport, detailedCntReport
-
def main():
# ------------------------------ 1. Parsing and validating input arguments -------------------------------------
next_step()
"Although the automatic selection usually provides a reasonable performance, "
"but it still may be non-optimal for some cases, for more information look at README. ")
+ command_line_arguments = get_command_line_arguments(sys.argv)
if args.report_type:
statistics = StatisticsReport(StatisticsReport.Config(args.report_type, args.report_folder))
- statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, get_command_line_arguments(sys.argv))
+ statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, command_line_arguments)
+
+ def is_flag_set_in_command_line(flag):
+ return any(x.strip('-') == flag for x, y in command_line_arguments)
+
+ device_name = args.target_device
+ devices = parse_devices(device_name)
+ device_number_streams = parse_nstreams_value_per_device(devices, args.number_streams)
+
+ config = {}
+ if args.load_config:
+ load_config(args.load_config, config)
# ------------------------------ 2. Loading Inference Engine ---------------------------------------------------
next_step(step_id=2)
- device_name = args.target_device.upper()
-
benchmark = Benchmark(args.target_device, args.number_infer_requests,
args.number_iterations, args.time, args.api_type)
- benchmark.add_extension(args.path_to_extension, args.path_to_cldnn_config)
+ ## CPU (MKLDNN) extensions
+ if CPU_DEVICE_NAME in device_name and args.path_to_extension:
+ benchmark.add_extension(path_to_extension=args.path_to_extension)
+
+ ## GPU (clDNN) Extensions
+ if GPU_DEVICE_NAME in device_name and args.path_to_cldnn_config:
+ if GPU_DEVICE_NAME not in config.keys():
+ config[GPU_DEVICE_NAME] = {}
+ config[GPU_DEVICE_NAME]['CONFIG_FILE'] = args.path_to_cldnn_config
+
+ if GPU_DEVICE_NAME in config.keys() and 'CONFIG_FILE' in config[GPU_DEVICE_NAME].keys():
+ cldnn_config = config[GPU_DEVICE_NAME]['CONFIG_FILE']
+ benchmark.add_extension(path_to_cldnn_config=cldnn_config)
version = benchmark.get_version_info()
# --------------------- 6. Setting device configuration --------------------------------------------------------
next_step()
- benchmark.set_config(args.number_streams, args.api_type, args.number_threads,
- args.infer_threads_pinning)
+
+ perf_counts = False
+ for device in devices:
+ if device not in config.keys():
+ config[device] = {}
+ ## Set performance counter
+ if is_flag_set_in_command_line('pc'):
+ ## set to user defined value
+ config[device]['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
+ elif 'PERF_COUNT' in config[device].keys() and config[device]['PERF_COUNT'] == 'YES':
+ logger.warn("Performance counters for {} device is turned on. ".format(device) +
+ "To print results use -pc option.")
+ elif args.report_type in [ averageCntReport, detailedCntReport ]:
+ logger.warn("Turn on performance counters for {} device ".format(device) +
+ "since report type is {}.".format(args.report_type))
+ config[device]['PERF_COUNT'] = 'YES'
+ elif args.exec_graph_path is not None:
+ logger.warn("Turn on performance counters for {} device ".format(device) +
+ "due to execution graph dumping.")
+ config[device]['PERF_COUNT'] = 'YES'
+ else:
+ ## set to default value
+ config[device]['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
+ perf_counts = True if config[device]['PERF_COUNT'] == 'YES' else perf_counts
+
+ def set_throughput_streams():
+ key = device + "_THROUGHPUT_STREAMS"
+ if device in device_number_streams.keys():
+ ## set to user defined value
+ supported_config_keys = benchmark.ie.get_metric(device, 'SUPPORTED_CONFIG_KEYS')
+ if key not in supported_config_keys:
+ raise Exception("Device {} doesn't support config key '{}'! ".format(device, key) +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>")
+ config[device][key] = device_number_streams[device]
+ elif key not in config[device].keys() and args.api_type == "async":
+ logger.warn("-nstreams default value is determined automatically for {} device. ".format(device) +
+ "Although the automatic selection usually provides a reasonable performance,"
+ "but it still may be non-optimal for some cases, for more information look at README.")
+ config[device][key] = device + "_THROUGHPUT_AUTO"
+ if key in config[device].keys():
+ device_number_streams[device] = config[device][key]
+
+ if device == CPU_DEVICE_NAME: # CPU supports few special performance-oriented keys
+ # limit threading for CPU portion of inference
+ if args.number_threads and is_flag_set_in_command_line("nthreads"):
+ config[device]['CPU_THREADS_NUM'] = str(args.number_threads)
+
+ if is_flag_set_in_command_line("enforcebf16") or is_flag_set_in_command_line("enforce_bfloat16"):
+ config[device]['ENFORCE_BF16'] = 'YES' if args.enforce_bfloat16 else 'NO'
+
+ if is_flag_set_in_command_line('pin'):
+ ## set to user defined value
+ config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning
+ elif 'CPU_BIND_THREAD' not in config[device].keys():
+ if MULTI_DEVICE_NAME in device_name and GPU_DEVICE_NAME in device_name:
+ logger.warn("Turn off threads pinning for {}".format(device) +
+ "device since multi-scenario with GPU device is used.")
+ config[device]['CPU_BIND_THREAD'] = 'NO'
+ else:
+ ## set to default value
+ config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning
+
+ ## for CPU execution, more throughput-oriented execution via streams
+ set_throughput_streams()
+ elif device == GPU_DEVICE_NAME:
+ ## for GPU execution, more throughput-oriented execution via streams
+ set_throughput_streams()
+
+ if MULTI_DEVICE_NAME in device_name and CPU_DEVICE_NAME in device_name:
+ logger.warn("Turn on GPU trottling. Multi-device execution with the CPU + GPU performs best with GPU trottling hint, " +
+ "which releases another CPU thread (that is otherwise used by the GPU driver for active polling)")
+ config[device]['CLDNN_PLUGIN_THROTTLE'] = '1'
+ elif device == MYRIAD_DEVICE_NAME:
+ config[device]['LOG_LEVEL'] = 'LOG_INFO'
+ perf_counts = perf_counts
+
+ benchmark.set_config(config)
# --------------------- 7. Loading the model to the device -----------------------------------------------------
next_step()
start_time = datetime.utcnow()
- perf_counts = True if args.perf_counts or \
- args.report_type in [ averageCntReport, detailedCntReport ] or \
- args.exec_graph_path else False
- exe_network = benchmark.load_network(ie_network, perf_counts)
+ exe_network = benchmark.load_network(ie_network)
duration_ms = "{:.2f}".format((datetime.utcnow() - start_time).total_seconds() * 1000)
logger.info("Load network took {} ms".format(duration_ms))
if statistics:
[
('load network time (ms)', duration_ms)
])
+ ## Update number of streams
+ for device in device_number_streams.keys():
+ key = device + '_THROUGHPUT_STREAMS'
+ device_number_streams[device] = benchmark.ie.get_config(device, key)
# --------------------- 8. Setting optimal runtime parameters --------------------------------------------------
next_step()
('topology', ie_network.name),
('target device', device_name),
('API', args.api_type),
- ('precision', str(ie_network.precision)),
+ ('precision', "UNSPECIFIED"),
('batch size', str(batch_size)),
('number of iterations', str(benchmark.niter) if benchmark.niter else "0"),
('number of parallel infer requests', str(benchmark.nireq)),
('duration (ms)', str(get_duration_in_milliseconds(benchmark.duration_seconds))),
])
- for nstreams in benchmark.device_number_streams.items():
+ for nstreams in device_number_streams.items():
statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
[
("number of {} streams".format(nstreams[0]), str(nstreams[1])),
# ------------------------------------ 11. Dumping statistics report -------------------------------------------
next_step()
+ if args.dump_config:
+ dump_config(args.dump_config, config)
+ logger.info("Inference Engine configuration settings were dumped to {}".format(args.dump_config))
+
if args.exec_graph_path:
dump_exec_graph(exe_network, args.exec_graph_path)
from openvino.tools.benchmark.utils.constants import XML_EXTENSION_PATTERN
from openvino.tools.benchmark.utils.utils import show_available_devices
-
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
'Default value is determined automatically for a device. Please note that although the automatic selection '
'usually provides a reasonable performance, it still may be non - optimal for some cases, especially for very small networks. '
'See samples README for more details.')
-
+ args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True,
+ help='Optional. Enforcing of floating point operations execution in bfloat16 precision where it is acceptable.')
args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
help='Number of threads to use for inference on the CPU '
'(including HETERO and MULTI cases).')
args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES', choices=['YES', 'NO', 'NUMA'],
help='Optional. Enable threads->cores (\'YES\' is default value), threads->(NUMA)nodes (\'NUMA\') or completely disable (\'NO\')'
'CPU threads pinning for CPU-involved inference.')
- args.add_argument('--exec_graph_path', type=str, required=False,
+ args.add_argument('-exec_graph_path', '--exec_graph_path', type=str, required=False,
help='Optional. Path to a file where to store executable graph information serialized.')
args.add_argument('-pc', '--perf_counts', type=str2bool, required=False, default=False, nargs='?', const=True,
help='Optional. Report performance counters.', )
- args.add_argument('--report_type', type=str, required=False,
+ args.add_argument('-report_type', '--report_type', type=str, required=False,
choices=['no_counters', 'average_counters', 'detailed_counters'],
help="Optional. Enable collecting statistics report. \"no_counters\" report contains "
"configuration options specified, resulting FPS and latency. \"average_counters\" "
"counters values for each layer from the network. \"detailed_counters\" report "
"extends \"average_counters\" report and additionally includes per-layer PM "
"counters and latency for each executed infer request.")
- args.add_argument('--report_folder', type=str, required=False, default='',
+ args.add_argument('-report_folder', '--report_folder', type=str, required=False, default='',
help="Optional. Path to a folder where statistics report is stored.")
+ args.add_argument('-dump_config', type=str, required=False, default='',
+ help="Optional. Path to JSON file to dump IE parameters, which were set by application.")
+ args.add_argument('-load_config', type=str, required=False, default='',
+ help="Optional. Path to JSON file to load custom IE parameters."
+ " Please note, command line parameters have higher priority then parameters from configuration file.")
parsed_args = parser.parse_args()
validate_args(parsed_args)
from .inputs_filling import is_image
from .logging import logger
+import json
def static_vars(**kwargs):
def decorate(func):
def parse_devices(device_string):
+ if device_string in ['MULTI', 'HETERO']:
+ return list()
devices = device_string
if ':' in devices:
devices = devices.partition(':')[2]
device_value_vec = device_value_string.split(':')
if len(device_value_vec) == 2:
device_name = device_value_vec[0]
- nstreams = int(device_value_vec[1])
+ nstreams = device_value_vec[1]
if device_name in devices:
result[device_name] = nstreams
else:
raise Exception("Can't set nstreams value " + str(nstreams) +
" for device '" + device_name + "'! Incorrect device name!");
elif len(device_value_vec) == 1:
- nstreams = int(device_value_vec[0])
+ nstreams = device_value_vec[0]
for device in devices:
result[device] = nstreams
elif not device_value_vec:
def show_available_devices():
ie = IECore()
- print("\nAvailable target devices: ", (" ".join(ie.available_devices)))
\ No newline at end of file
+ print("\nAvailable target devices: ", (" ".join(ie.available_devices)))
+
+def dump_config(filename, config):
+ with open(filename, 'w') as f:
+ json.dump(config, f, indent=4)
+
+def load_config(filename, config):
+ with open(filename) as f:
+ config.update(json.load(f))
\ No newline at end of file