From 9e45ab56bd165609118989c0d1bec309c3754560 Mon Sep 17 00:00:00 2001 From: Chunseok Lee Date: Thu, 30 Jul 2020 11:40:16 +0900 Subject: [PATCH] patch for rebase master on release/1.7.0 Change-Id: Id38b617d325ef7e854995a47f032bdf482a779b3 --- .ahub/tcchecker-tca/config.yaml | 43 + compiler/.ahub/tcchecker-tca/config.yaml | 54 + compiler/bcq-tools/CMakeLists.txt | 27 + compiler/bcq-tools/README.md | 78 + compiler/bcq-tools/generate_bcq_output_arrays | 90 + compiler/bcq-tools/preserve_bcq_info | 116 + compiler/circle-quantizer/CMakeLists.txt | 1 + compiler/circle-quantizer/requires.cmake | 1 + compiler/circle-quantizer/src/CircleQuantizer.cpp | 18 +- compiler/circle-tensordump/driver/Driver.cpp | 2 +- compiler/circle-tensordump/src/Dump.cpp | 48 +- compiler/circle-verify/src/Driver.cpp | 2 +- .../circle2circle-dredd-recipe-test/CMakeLists.txt | 93 +- .../circle2circle-dredd-recipe-test/requires.cmake | 4 +- compiler/circle2circle-dredd-recipe-test/test.lst | 3 +- .../circle2circle-dredd-recipe-test/testall.sh | 13 +- compiler/circle2circle/CMakeLists.txt | 2 + compiler/circle2circle/requires.cmake | 1 + compiler/circle2circle/src/Circle2Circle.cpp | 14 + compiler/circlechef/CMakeLists.txt | 4 +- compiler/circlechef/circle/src/RecipeChef.cpp | 2 + compiler/circlechef/core/src/ModelChef.cpp | 1 + compiler/circlechef/proto/circlechef.proto | 1 + compiler/circlechef/tools/file/Driver.cpp | 2 +- compiler/circlechef/tools/reverse/Driver.cpp | 2 +- compiler/circledump/driver/Driver.cpp | 2 +- compiler/circledump/src/OpPrinter.cpp | 15 + compiler/common-artifacts/exclude.lst | 31 +- compiler/hermes/src/hermes.test.cpp | 25 +- compiler/locomotiv/src/Node/BiasEncode.test.cpp | 14 +- compiler/locomotiv/src/Node/MatMul.test.cpp | 4 + compiler/locop/src/FormattedGraph.test.cpp | 2 + compiler/locop/src/FormattedTensorShape.test.cpp | 2 + .../include/luci_interpreter/core/Tensor.h | 9 +- compiler/luci-interpreter/src/core/KernelParams.h | 5 + .../luci-interpreter/src/kernels/CMakeLists.txt | 9 + .../luci-interpreter/src/kernels/DepthToSpace.cpp | 90 + .../luci-interpreter/src/kernels/DepthToSpace.h | 45 + .../src/kernels/DepthToSpace.test.cpp | 60 + .../src/kernels/L2Normalize.test.cpp | 9 +- .../src/kernels/LeakyRelu.test.cpp | 11 +- .../luci-interpreter/src/kernels/Logistic.test.cpp | 6 +- compiler/luci-interpreter/src/kernels/Reverse.cpp | 81 + compiler/luci-interpreter/src/kernels/Reverse.h | 43 + .../luci-interpreter/src/kernels/Reverse.test.cpp | 66 + compiler/luci-interpreter/src/kernels/Slice.cpp | 149 + compiler/luci-interpreter/src/kernels/Slice.h | 44 + .../luci-interpreter/src/kernels/Slice.test.cpp | 64 + .../src/kernels/TransposeConv.test.cpp | 23 +- .../luci-interpreter/src/loader/CMakeLists.txt | 7 + .../luci-interpreter/src/loader/GraphLoader.cpp | 23 +- compiler/luci-interpreter/src/loader/GraphLoader.h | 18 +- .../luci-interpreter/src/loader/KernelBuilder.cpp | 108 +- .../luci-interpreter/src/loader/KernelBuilder.h | 17 +- .../src/loader/KernelBuilder.test.cpp | 743 + .../luci-interpreter/src/loader/ModuleLoader.cpp | 7 +- .../luci-interpreter/src/loader/ModuleLoader.h | 5 - compiler/luci-value-test/evalverify.sh | 6 +- compiler/luci-value-test/test.lst | 110 +- .../luci/export/src/CircleOperationExporter.cpp | 2 +- compiler/luci/export/src/CircleTensorExporter.cpp | 5 +- compiler/luci/import/src/CircleReader.cpp | 2 + compiler/luci/import/src/Importer.test.cpp | 7 +- compiler/luci/import/src/Nodes/CircleLogistic.cpp | 14 - .../luci/import/src/Nodes/CircleTransposeConv.cpp | 18 + 
compiler/luci/lang/include/luci/IR/CircleNodes.lst | 1 + .../luci/lang/include/luci/IR/CircleQuantParam.h | 1 + compiler/luci/lang/src/Module.test.cpp | 2 +- compiler/luci/lang/src/Nodes/CircleCustom.test.cpp | 7 +- compiler/luci/lang/src/Nodes/CircleIf.test.cpp | 4 + compiler/luci/lang/src/Nodes/CircleWhile.test.cpp | 4 + compiler/luci/pass/src/CircleOptimizer.cpp | 4 +- compiler/luci/pass/src/FuseBCQPass.cpp | 426 +- compiler/luci/pass/src/QuantizationUtils.cpp | 7 + compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 21 +- compiler/luci/tests/test.lst | 9 + compiler/one-cmds/one-codegen | 25 +- compiler/one-cmds/one-import | 25 +- compiler/one-cmds/one-import-tf | 30 +- compiler/one-cmds/one-import-tflite | 20 +- compiler/one-cmds/one-optimize | 20 +- compiler/one-cmds/one-pack | 23 +- compiler/one-cmds/one-quantize | 23 +- compiler/one-cmds/requires.cmake | 1 + compiler/record-minmax/CMakeLists.txt | 5 + compiler/record-minmax/driver/Driver.cpp | 16 +- compiler/record-minmax/requires.cmake | 1 + compiler/record-minmax/src/HDF5Importer.cpp | 1 + compiler/record-minmax/src/MinMaxObserver.cpp | 3 +- compiler/record-minmax/src/RecordMinMax.cpp | 2 +- .../record-minmax/tests/RecordFunction.test.cpp | 14 + compiler/tfl-verify/CMakeLists.txt | 1 + compiler/tfl-verify/requires.cmake | 1 + compiler/tfl-verify/src/Driver.cpp | 19 +- compiler/tflchef/core/src/ModelChef.cpp | 1 + compiler/tflchef/proto/tflchef.proto | 1 + compiler/tflchef/tflite/src/RecipeChef.cpp | 2 + compiler/tflchef/tools/file/Driver.cpp | 2 +- compiler/tflchef/tools/reverse/Driver.cpp | 2 +- compiler/tfldump/driver/Driver.cpp | 2 +- compiler/tflite2circle/CMakeLists.txt | 1 + compiler/tflite2circle/driver/Driver.cpp | 17 +- compiler/tflite2circle/requires.cmake | 1 + compiler/vconone/CMakeLists.txt | 31 + compiler/vconone/README.md | 14 + compiler/vconone/driver/driver.cpp | 36 + compiler/vconone/include/vconone/vconone.h | 61 + compiler/vconone/src/version.cpp | 63 + compiler/vconone/src/version.test.cpp | 49 + compiler/vconone/version_cfg.h.in | 22 + .../core/CL/kernels/CLArgOperationKernel.h | 124 - .../arm_compute/core/CL/kernels/CLCastKernel.h | 121 - .../core/CL/kernels/CLDepthToSpaceKernel.h | 82 - .../CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h | 117 - .../arm_compute/core/CL/kernels/CLPReLUKernel.h | 83 - .../core/CL/kernels/CLSpaceToDepthKernel.h | 82 - .../kernels/CLTransposeConvLayerUpsampleKernel.h | 109 - .../core/CPP/kernels/CPPUpsampleKernelEx.h | 88 - .../arm_compute/core/NEON/kernels/NECastKernel.h | 96 - .../NEON/kernels/NEDepthToSpaceLayerKernelEx.h | 96 - .../core/NEON/kernels/NEElementwiseUnaryKernelEx.h | 118 - .../arm_compute/core/NEON/kernels/NEPReLUKernel.h | 100 - .../NEON/kernels/NESpaceToDepthLayerKernelEx.h | 97 - .../arm_compute/runtime/CL/CLFunctionsEx.h | 11 - .../runtime/CL/functions/CLArgOperation.h | 129 - .../runtime/CL/functions/CLBatchToSpaceND.h | 69 - .../arm_compute/runtime/CL/functions/CLCast.h | 75 - .../runtime/CL/functions/CLDepthToSpace.h | 68 - .../CL/functions/CLDirectTransposeConvLayer.h | 201 + .../CL/functions/CLFullyConnectedHybridLayer.h | 4 +- .../CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h | 142 - .../runtime/CL/functions/CLLogicalNot.h | 62 - .../arm_compute/runtime/CL/functions/CLPReLU.h | 64 - .../runtime/CL/functions/CLPixelWiseDivision.h | 103 - .../runtime/CL/functions/CLRNNLayerEx.h | 120 - .../runtime/CL/functions/CLSpaceToDepth.h | 68 - .../runtime/CL/functions/CLStridedSliceEx.h | 81 - .../runtime/CL/functions/CLTransposeConvLayer.h | 176 +- 
.../CL/functions/CLTransposeConvLayerUpsample.h | 102 - .../runtime/CPP/functions/CPPUpsampleEx.h | 65 - .../arm_compute/runtime/NEON/NEFunctionsEx.h | 7 - .../arm_compute/runtime/NEON/functions/NECast.h | 79 - .../runtime/NEON/functions/NEDepthToSpaceLayerEx.h | 78 - .../NEON/functions/NEElementwiseUnaryLayerEx.h | 70 - .../NEON/functions/NEFullyConnectedHybridLayer.h | 4 +- .../functions/NEGEMMLowpMatrixMultiplyCoreEx.h | 170 - .../arm_compute/runtime/NEON/functions/NEPReLU.h | 63 - .../runtime/NEON/functions/NERNNLayerEx.h | 130 - .../runtime/NEON/functions/NEReduceMeanEx.h | 99 - .../runtime/NEON/functions/NESpaceToBatchLayerEx.h | 136 - .../runtime/NEON/functions/NESpaceToDepthLayerEx.h | 79 - .../runtime/NEON/functions/NETransposeConvLayer.h | 68 +- .../ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 39 - .../src/core/CL/cl_kernels/arg_operation.cl | 137 - .../core/CL/cl_kernels/arithmetic_op_quantized.cl | 191 - .../ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 233 - .../src/core/CL/cl_kernels/depth_to_space.cl | 185 - .../ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 206 +- .../src/core/CL/cl_kernels/helpers_asymm.h | 185 +- .../ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 120 - .../src/core/CL/cl_kernels/prelu_quantized.cl | 138 - .../src/core/CL/cl_kernels/space_to_depth.cl | 185 - .../src/core/CL/kernels/CLArgOperationKernel.cpp | 181 - .../core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 1 + .../src/core/CL/kernels/CLCastKernel.cpp | 132 - .../src/core/CL/kernels/CLDepthToSpaceKernel.cpp | 140 - .../core/CL/kernels/CLEmbeddingLookupKernel.cpp | 1 + .../kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp | 372 - .../src/core/CL/kernels/CLGatherExKernel.cpp | 1 + .../core/CL/kernels/CLHashtableLookupKernel.cpp | 3 +- .../CLInstanceNormalizationLayerKernelEx.cpp | 2 +- .../CL/kernels/CLMultiplyScaleFactorKernel.cpp | 1 + .../src/core/CL/kernels/CLNegKernel.cpp | 1 + .../src/core/CL/kernels/CLPReLUKernel.cpp | 210 - .../CL/kernels/CLQuantizationSymmetricKernel.cpp | 3 +- .../core/CL/kernels/CLReduceOperationKernel.cpp | 1 + .../core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 1 + .../src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 148 - .../kernels/CLTransposeConvLayerUpsampleKernel.cpp | 188 - .../src/core/CPP/kernels/CPPUpsampleKernelEx.cpp | 118 - .../src/core/NEON/kernels/NECastKernel.cpp | 671 - .../NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp | 181 - .../NEON/kernels/NEElementwiseUnaryKernelEx.cpp | 221 - .../src/core/NEON/kernels/NEPReLUKernel.cpp | 291 - .../NEON/kernels/NEQuantizationSymmetricKernel.cpp | 2 +- .../NEON/kernels/NESpaceToDepthLayerKernelEx.cpp | 181 - .../src/runtime/CL/functions/CLArgOperation.cpp | 144 - .../src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 2 +- .../src/runtime/CL/functions/CLCast.cpp | 52 - .../src/runtime/CL/functions/CLDepthToSpace.cpp | 52 - .../CL/functions/CLDirectTransposeConvLayer.cpp | 267 + .../src/runtime/CL/functions/CLEmbeddingLookup.cpp | 2 +- .../CL/functions/CLFullyConnectedHybridLayer.cpp | 16 +- .../CL/functions/CLFullyConnectedLayerEx.cpp | 4 +- .../functions/CLFullyConnectedReshapingLayer.cpp | 16 +- .../functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp | 180 - .../src/runtime/CL/functions/CLGatherEx.cpp | 2 +- .../src/runtime/CL/functions/CLHashtableLookup.cpp | 2 +- .../functions/CLInstanceNormalizationLayerEx.cpp | 2 +- .../src/runtime/CL/functions/CLPReLU.cpp | 63 - .../src/runtime/CL/functions/CLRNNLayerEx.cpp | 163 - .../src/runtime/CL/functions/CLReduceOperation.cpp | 8 +- .../src/runtime/CL/functions/CLSpaceToDepth.cpp | 52 - 
.../runtime/CL/functions/CLTransposeConvLayer.cpp | 250 +- .../CL/functions/CLTransposeConvLayerUpsample.cpp | 92 - .../src/runtime/CPP/functions/CPPOneHotEx.cpp | 4 +- .../src/runtime/CPP/functions/CPPUpsampleEx.cpp | 53 - .../runtime/NEON/functions/NEActivationLayerEx.cpp | 4 +- .../NEON/functions/NEBinaryLogicalOperation.cpp | 6 +- .../src/runtime/NEON/functions/NECast.cpp | 60 - .../NEON/functions/NEDepthToSpaceLayerEx.cpp | 63 - .../runtime/NEON/functions/NEEmbeddingLookup.cpp | 4 +- .../NEON/functions/NEFullyConnectedHybridLayer.cpp | 14 +- .../functions/NEFullyConnectedReshapingLayer.cpp | 7 +- .../functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp | 513 - .../src/runtime/NEON/functions/NEGatherEx.cpp | 4 +- .../runtime/NEON/functions/NEHashtableLookup.cpp | 4 +- .../src/runtime/NEON/functions/NEPReLU.cpp | 55 - .../src/runtime/NEON/functions/NERNNLayerEx.cpp | 161 - .../src/runtime/NEON/functions/NEReduceMeanEx.cpp | 180 - .../NEON/functions/NESpaceToBatchLayerEx.cpp | 114 - .../NEON/functions/NESpaceToDepthLayerEx.cpp | 64 - .../NEON/functions/NETransposeConvLayer.cpp | 231 +- compute/cker/CMakeLists.txt | 3 + compute/cker/include/cker/Types.h | 11 + compute/cker/include/cker/Utils.h | 62 + .../cker/include/cker/operation/FullyConnected.h | 13 +- compute/cker/include/cker/operation/L2Normalize.h | 94 + compute/cker/include/cker/operation/Logistic.h | 9 - compute/cker/include/cker/operation/Pad.h | 15 +- compute/cker/include/cker/operation/Quantize.h | 47 + compute/cker/include/cker/operation/SpaceToDepth.h | 71 + compute/cker/include/cker/ruy/RuySupport.h | 2 +- docs/howto/how-to-build-runtime.md | 6 +- docs/nnfw/howto/CrossBuildForAndroid.md | 4 +- docs/runtime/core.md | 4 +- docs/runtime/heterogeneous-execution.md | 4 +- infra/cmake/packages/ARMComputeSourceConfig.cmake | 2 +- infra/cmake/packages/FlatBuffersConfig.cmake | 3 +- infra/cmake/packages/HDF5Config.cmake | 1 + infra/cmake/packages/Pybind11Config.cmake | 21 + infra/cmake/packages/Pybind11SourceConfig.cmake | 18 + infra/docker/Dockerfile | 3 +- infra/docker/Dockerfile.1804 | 7 +- infra/nncc/CMakeLists.txt | 1 + infra/nncc/command/utcount | 2 +- .../packages/TensorFlowLite-2.2.0/CMakeLists.txt | 2 +- infra/nnfw/config/gbs.conf | 6 +- infra/packaging/preset/20200630 | 14 +- infra/packaging/res/tf2nnpkg.20200630 | 19 +- infra/scripts/build-tcm.sh | 24 + infra/scripts/compiler_modules.sh | 2 +- .../scripts/docker_build_cross_aarch64_runtime.sh | 2 +- infra/scripts/docker_build_cross_arm_runtime.sh | 2 +- .../docker_build_cross_arm_runtime_release.sh | 2 +- infra/scripts/docker_build_cross_coverage.sh | 2 +- infra/scripts/docker_build_nncc.sh | 10 + infra/scripts/docker_build_tizen_cross.sh | 2 +- infra/scripts/docker_collect_nnpkg_resources.sh | 2 +- infra/scripts/tizen_xu4_test.sh | 2 +- master_diff_1.7.0.patch | 30424 +++++++++++++++++++ packaging/nnfw.spec | 2 +- .../AveragePool2D_U8_000/test.recipe | 26 + .../AveragePool2D_U8_000/test.reverse | 0 .../DepthwiseConv2D_003/test.recipe | 44 + .../DepthwiseConv2D_003/test.reverse | 0 .../DepthwiseConv2D_003/test.rule | 3 + .../DepthwiseConv2D_U8_001/test.recipe | 61 + .../DepthwiseConv2D_U8_001/test.reverse | 0 .../L2Normalize_U8_000/test.recipe | 22 + .../L2Normalize_U8_000/test.reverse | 0 .../Logistic_U8_000/test.recipe | 19 + .../Logistic_U8_000/test.reverse | 0 .../TransposeConv_000/test.recipe | 2 +- res/TensorFlowLiteRecipes/Unique_000/test.recipe | 27 + res/TensorFlowLiteRecipes/Unique_000/test.reverse | 0 res/TensorFlowLiteRecipes/Unique_001/test.recipe | 27 + 
res/TensorFlowLiteRecipes/Unique_001/test.reverse | 0 res/TensorFlowLiteRecipes/Unique_002/test.recipe | 27 + res/TensorFlowLiteRecipes/Unique_002/test.reverse | 0 res/TensorFlowLiteRecipes/Unique_003/test.recipe | 27 + res/TensorFlowLiteRecipes/Unique_003/test.reverse | 0 .../Unique_U8_000/test.recipe | 28 + .../Unique_U8_000/test.reverse | 0 .../Unique_U8_001/test.recipe | 28 + .../Unique_U8_001/test.reverse | 0 runtime/libs/benchmark/CMakeLists.txt | 3 +- runtime/libs/benchmark/src/Result.cpp | 2 +- runtime/onert/api/include/nnfw.h | 18 +- runtime/onert/api/src/nnfw_api.cc | 1 + runtime/onert/api/src/nnfw_api_internal.cc | 31 +- runtime/onert/backend/acl_cl/KernelGenerator.cc | 804 +- runtime/onert/backend/acl_common/AclKernelGen.h | 269 + runtime/onert/backend/acl_neon/KernelGenerator.cc | 777 +- runtime/onert/backend/cpu/ConstantInitializer.cc | 35 +- runtime/onert/backend/cpu/ConstantInitializer.h | 9 + runtime/onert/backend/cpu/KernelGenerator.cc | 509 +- runtime/onert/backend/cpu/KernelGenerator.h | 3 + runtime/onert/backend/cpu/StaticTensorManager.cc | 104 + runtime/onert/backend/cpu/StaticTensorManager.h | 61 + runtime/onert/backend/cpu/Tensor.h | 15 +- runtime/onert/backend/cpu/TensorBuilder.cc | 18 +- runtime/onert/backend/cpu/TensorBuilder.h | 13 +- runtime/onert/backend/cpu/ops/CompareLayer.cc | 238 +- .../onert/backend/cpu/ops/FullyConnectedLayer.cc | 35 +- .../onert/backend/cpu/ops/FullyConnectedLayer.h | 3 + runtime/onert/backend/cpu/ops/L2NormLayer.cc | 71 + runtime/onert/backend/cpu/ops/L2NormLayer.h | 55 + runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc | 4 +- runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h | 7 +- runtime/onert/backend/cpu/ops/OperationUtils.h | 11 + runtime/onert/backend/cpu/ops/PadLayer.cc | 25 +- runtime/onert/backend/cpu/ops/PadLayer.h | 8 +- runtime/onert/backend/cpu/ops/QuantizeLayer.cc | 63 + runtime/onert/backend/cpu/ops/QuantizeLayer.h | 56 + runtime/onert/backend/cpu/ops/SliceLayer.cc | 16 +- runtime/onert/backend/cpu/ops/SliceLayer.h | 3 +- runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc | 70 + runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h | 54 + .../onert/core/include/backend/ITensorBuilder.h | 4 +- .../onert/core/include/backend/ITensorRegistry.h | 68 +- .../backend/cpu_common/StaticTensorManager.h | 4 +- .../core/include/compiler/StaticShapeInference.h | 1 + .../core/include/exec/DynamicShapeInference.h | 1 + runtime/onert/core/include/ir/Operations.Include.h | 1 + runtime/onert/core/include/ir/Operations.lst | 1 + .../onert/core/include/ir/operation/LogSoftmax.h | 2 +- runtime/onert/core/include/ir/operation/Pad.h | 2 +- runtime/onert/core/include/ir/operation/Quantize.h | 49 + .../backend/controlflow/DynamicTensorManager.cc | 14 +- .../src/backend/controlflow/KernelGenerator.cc | 22 +- .../core/src/backend/controlflow/TensorBuilder.cc | 6 +- .../core/src/backend/controlflow/UserTensor.h | 1 + .../src/backend/cpu_common/DynamicTensorManager.cc | 10 +- .../src/backend/cpu_common/StaticTensorManager.cc | 28 +- runtime/onert/core/src/compiler/ExecutorFactory.cc | 37 +- runtime/onert/core/src/compiler/ExecutorFactory.h | 3 + runtime/onert/core/src/compiler/HEScheduler.h | 10 +- .../onert/core/src/compiler/OperationValidator.cc | 161 +- .../onert/core/src/compiler/OperationValidator.h | 4 +- .../core/src/compiler/StaticShapeInference.cc | 5 + runtime/onert/core/src/compiler/TensorBuilders.h | 12 + .../onert/core/src/exec/DynamicShapeInference.cc | 5 + runtime/onert/core/src/exec/ExecutorBase.cc | 4 +- 
runtime/onert/core/src/interp/operations/Pad.cc | 4 +- runtime/onert/core/src/ir/LoweredGraph.cc | 3 - runtime/onert/core/src/ir/operation/Quantize.cc | 37 + .../core/src/ir/pass/PermutationEliminationPass.cc | 195 - .../core/src/ir/pass/PermutationEliminationPass.h | 86 - .../core/src/ir/pass/PermutationInsertionPass.cc | 15 +- .../frontend/base_loader/include/base_loader.h | 36 + .../frontend/nnapi/wrapper/OperationFactory.cc | 337 +- runtime/onert/test/core/exec/ExecInstance.cc | 94 +- tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl | 18 +- .../nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon | 19 +- tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu | 13 +- tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl | 18 +- tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon | 19 +- tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu | 13 +- tests/nnapi/nnapi_gtest.skip.noarch.interp | 16 + tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu | 13 +- .../specs/V1_0/l2_normalization_quant8_nnfw.mod.py | 30 + .../specs/{skip => }/V1_2/pad_v2_1_float.mod.py | 0 .../specs/{skip => }/V1_2/pad_v2_1_quant8.mod.py | 0 .../specs/{skip => }/V1_2/pad_v2_all_dims.mod.py | 0 .../{skip => }/V1_2/pad_v2_all_dims_quant8.mod.py | 0 .../specs/{skip => }/V1_2/pad_v2_low_rank.mod.py | 0 .../{skip => }/V1_2/pad_v2_low_rank_quant8.mod.py | 0 tests/nnapi/specs/{skip => }/V1_2/quantize.mod.py | 0 tests/nnfw_api/src/ValidationTestAddModelLoaded.cc | 19 +- .../src/ValidationTestAddSessionPrepared.cc | 6 +- tests/nnfw_api/src/ValidationTestSessionCreated.cc | 28 +- tests/scripts/benchmark_nnapi.sh | 23 +- tests/scripts/common.sh | 11 +- tests/scripts/framework/run_test.sh | 60 +- tests/scripts/test-driver.sh | 17 - tests/scripts/test_framework.sh | 10 +- tests/tools/nnpackage_run/CMakeLists.txt | 2 +- tests/tools/nnpackage_run/src/args.cc | 246 +- tests/tools/nnpackage_run/src/h5formatter.cc | 8 +- tests/tools/tflite_loader/CMakeLists.txt | 2 +- tests/tools/tflite_run/CMakeLists.txt | 2 +- .../nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh | 5 +- tools/tflitefile_tool/select_operator.py | 21 +- tools/tflkit/README.md | 12 +- tools/update_version/update-version | 11 +- 386 files changed, 38093 insertions(+), 13423 deletions(-) create mode 100644 .ahub/tcchecker-tca/config.yaml create mode 100644 compiler/.ahub/tcchecker-tca/config.yaml create mode 100644 compiler/bcq-tools/CMakeLists.txt create mode 100644 compiler/bcq-tools/README.md create mode 100644 compiler/bcq-tools/generate_bcq_output_arrays create mode 100644 compiler/bcq-tools/preserve_bcq_info create mode 100644 compiler/luci-interpreter/src/kernels/DepthToSpace.cpp create mode 100644 compiler/luci-interpreter/src/kernels/DepthToSpace.h create mode 100644 compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Reverse.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Reverse.h create mode 100644 compiler/luci-interpreter/src/kernels/Reverse.test.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Slice.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Slice.h create mode 100644 compiler/luci-interpreter/src/kernels/Slice.test.cpp create mode 100644 compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp create mode 100644 compiler/vconone/CMakeLists.txt create mode 100644 compiler/vconone/README.md create mode 100644 compiler/vconone/driver/driver.cpp create mode 100644 compiler/vconone/include/vconone/vconone.h create mode 100644 compiler/vconone/src/version.cpp create mode 100644 
compiler/vconone/src/version.test.cpp create mode 100644 compiler/vconone/version_cfg.h.in delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h create mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl delete mode 100644 
compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp create mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp create mode 100644 compute/cker/include/cker/operation/L2Normalize.h create mode 100644 compute/cker/include/cker/operation/Quantize.h create mode 100644 compute/cker/include/cker/operation/SpaceToDepth.h create mode 100644 infra/cmake/packages/Pybind11Config.cmake create mode 100644 infra/cmake/packages/Pybind11SourceConfig.cmake create mode 100644 infra/scripts/build-tcm.sh create mode 
100644 master_diff_1.7.0.patch create mode 100644 res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse create mode 100644 res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_001/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_001/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_002/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_002/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_003/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_003/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse create mode 100644 runtime/onert/backend/acl_common/AclKernelGen.h create mode 100644 runtime/onert/backend/cpu/StaticTensorManager.cc create mode 100644 runtime/onert/backend/cpu/StaticTensorManager.h create mode 100644 runtime/onert/backend/cpu/ops/L2NormLayer.cc create mode 100644 runtime/onert/backend/cpu/ops/L2NormLayer.h create mode 100644 runtime/onert/backend/cpu/ops/QuantizeLayer.cc create mode 100644 runtime/onert/backend/cpu/ops/QuantizeLayer.h create mode 100644 runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc create mode 100644 runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h create mode 100644 runtime/onert/core/include/ir/operation/Quantize.h create mode 100644 runtime/onert/core/src/ir/operation/Quantize.cc delete mode 100644 runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc delete mode 100644 runtime/onert/core/src/ir/pass/PermutationEliminationPass.h create mode 100644 tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_1_float.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_1_quant8.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_all_dims.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_all_dims_quant8.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_low_rank.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_low_rank_quant8.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/quantize.mod.py (100%) diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml new file mode 100644 index 0000000..cd34d79 --- /dev/null +++ b/.ahub/tcchecker-tca/config.yaml @@ -0,0 +1,43 @@ +version: 2 +test: + - name: NN Runtime + testCaseLanguage: CPP + testFW: GTEST + testCaseFolder: + - ./compute/test/cker + - ./runtime/onert/core/src/backend/cpu_common + - 
./runtime/onert/frontend/nnapi + - ./runtime/onert/test/core/compiler + - ./runtime/onert/test/core/exec + - ./runtime/onert/test/core/interp + - ./runtime/onert/test/graph + - ./runtime/onert/test/graph/operand + - ./runtime/onert/test/graph/operation + - ./runtime/onert/test/graph/verifier + - ./runtime/onert/test/ir + - ./runtime/onert/test/util + - ./tests/nnapi/src + - ./tests/nnfw_api/src + - ./tests/tools/tflite_run/src + + testFile: + - extension: cpp + any: true + - extension: cc + any: true + + testCase: + - condition: + - functionName: + starts: + - TEST + + negativeTestCase: + - condition: + - testName: + starts: + - neg_ + + positiveTestCase: + - condition: + - inverse: negativeTestCase diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml new file mode 100644 index 0000000..ef681de --- /dev/null +++ b/compiler/.ahub/tcchecker-tca/config.yaml @@ -0,0 +1,54 @@ +version: 2 +test: + - name: NN Compiler + testCaseLanguage: CPP + testFW: GTEST + testCaseFolder: + - ./angkor + - ./arser + - ./circle2circle + - ./circle-quantizer + - ./cwrap + - ./foder + - ./hermes + - ./hermes-std + - ./loco + - ./locomotiv + - ./locop + - ./logo + - ./logo-core + - ./luci + - ./luci-interpreter + - ./luci-value-test + - ./mio-circle + - ./mio-tflite + - ./oops + - ./pepper-assert + - ./pepper-str + - ./pepper-strcast + - ./pp + - ./record-minmax + - ./safemain + - ./souschef + - ./stdex + - ./tflite2circle + + testFile: + - extension: .test.cpp + any: true + + testCase: + - condition: + - functionName: + starts: + - TEST + + negativeTestCase: + - condition: + - testName: + ends: + - _NEG + + positiveTestCase: + - condition: + - inverse: negativeTestCase diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt new file mode 100644 index 0000000..ae231bd --- /dev/null +++ b/compiler/bcq-tools/CMakeLists.txt @@ -0,0 +1,27 @@ +set(BCQ_TOOLS_FILES + generate_bcq_output_arrays + preserve_bcq_info +) + +foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES}) + + set(BCQ_TOOLS_FILE ${BCQ_TOOLS}) + set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}") + set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}") + set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target") + + add_custom_command(OUTPUT ${BCQ_TOOLS_BIN} + COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}" + DEPENDS ${BCQ_TOOLS_SRC} + COMMENT "Generate ${BCQ_TOOLS_BIN}" + ) + + add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN}) + + install(FILES ${BCQ_TOOLS_BIN} + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_WRITE GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION bin) + +endforeach(BCQ_TOOLS) diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md new file mode 100644 index 0000000..18b0f48 --- /dev/null +++ b/compiler/bcq-tools/README.md @@ -0,0 +1,78 @@ +# BCQ Tools + +This directory includes tools related to BCQ. + +## preserve_bcq_info + +### Purpose + +`preserve_bcq_info` is for preserving constant nodes which include BCQ information. +When a `.pb` file is converted to a `.tflite` file by the TFLite converter, constant nodes whose values are exactly the same are removed and linked to a single representative node. +This makes it impossible to know which constant node should be linked to a node to which we want to apply BCQ. +One solution is to make all identical constant nodes different by inserting unique values, and then to ignore the newly generated unique values when BCQ fusing is applied.
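+A minimal standalone sketch of that workaround (illustrative only; this is not the tool's code, and the array values are made up):
+
+```python
+# Illustration: appending a unique negative value to otherwise identical
+# constants keeps a converter's constant-deduplication step from merging them.
+import numpy as np
+
+constants = {name: np.array([1, 2, 3]) for name in ("const1", "const2", "const3")}
+
+preserved = {}
+unique_value = -1
+for name, value in constants.items():
+    preserved[name] = np.append(value, unique_value)  # e.g. [1, 2, 3, -1]
+    unique_value -= 1
+
+# The originals are all equal; the preserved constants are pairwise distinct.
+assert len({tuple(v) for v in constants.values()}) == 1
+assert len({tuple(v) for v in preserved.values()}) == len(preserved)
+```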
+`preserve_bcq_info` generates and inserts unique dummy values into constant nodes whose values are the same, so that they are not removed by the TensorFlow Lite converter. +As a result, BCQ information will be preserved. + +### How to use + +```bash +preserve_bcq_info \ +--input_path /path/to/original_model.pb \ +--output_path /path/to/preserved_model.pb +``` + +### How it works + +If we add a unique dummy value at the end of each constant node, all the constant nodes become different. The following is an example. + +``` +[Original Constant Nodes] +const(value=[1, 2, 3], name='const1') +const(value=[1, 2, 3], name='const2') +const(value=[1, 2, 3], name='const3') + +[After BCQ information preserved] +const(value=[1, 2, 3, -1], name='const1') +const(value=[1, 2, 3, -2], name='const2') +const(value=[1, 2, 3, -3], name='const3') +``` + +For dummy values, negative values are used instead of positive values. +This is because positive values may be confused with original constant node values. +The unique dummy values start from -1 and proceed to -2, -3, ..., -N, where N is the number of preserved constant nodes. + +### Caution + +- Newly generated dummy values should be ignored when the constant nodes are used. + +## generate_bcq_output_arrays + +### Purpose + +To apply BCQ, BCQ information nodes should be designated as model outputs so that they remain alive even after TFLite conversion is finished. +However, there may be too many nodes to designate by hand, and sometimes the resulting string is too long to copy and paste. +`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes. + +### How to use + +```bash +generate_bcq_output_arrays \ +--input_path /path/to/original_model.pb \ +--output_path /path/to/output_arrays.txt +``` + +### How it works + +``` +[Original BCQ information nodes] +const(value=[1, 2, 3, -1], name='const1') +const(value=[1, 2, 3, -2], name='const2') +const(value=[1, 2, 3, -3], name='const3') + +[Generated output_arrays] +,const1,const2,const3 +``` + +### Caution + +- The generated output_arrays string starts with a comma. diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays new file mode 100644 index 0000000..48e8a93 --- /dev/null +++ b/compiler/bcq-tools/generate_bcq_output_arrays @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 + +import tensorflow as tf + +import argparse +import sys + + +def _get_parser(): + """ + Returns an ArgumentParser for generating output_arrays. + """ + parser = argparse.ArgumentParser( + description=("Command line tool to generate output_arrays of BCQ nodes")) + + # Input and output path.
+ parser.add_argument( + "-i", + "--input_path", + type=str, + help="Full filepath of the input file.", + required=True) + parser.add_argument( + "-o", + "--output_path", + type=str, + help="Full filepath of the output file.", + required=True) + + return parser + + +def load_graph(frozen_graph_filename): + """ + Load graph from frozen pb file + """ + with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name='') + return graph + + +def dtype2str(dtype): + if dtype == "int32": + return "TF_INT32" + elif dtype == "int64": + return "TF_INT64" + elif dtype == "float32": + return "TF_FLOAT" + elif dtype == "bool": + return "TF_BOOL" + else: + raise Exception("Not supported dtype") + + +def print_output_arrays(flags): + graph_model = load_graph(flags.input_path) + graph_model_def = graph_model.as_graph_def() + ops = graph_model.get_operations() + + output_names = [op.outputs[0].name for op in ops + if op.type == "Const" and "bcqinfo_" in op.outputs[0].name] + + output_arrays = "" + for output_name in output_names: + output_arrays += "," + + colon_index = output_name.find(":") + if colon_index == -1: + output_arrays += output_name + else: + output_arrays += output_name[:colon_index] + + f = open(flags.output_path, 'w') + f.write(output_arrays) + f.close() + + +def main(): + # Parse argument. + parser = _get_parser() + flags = parser.parse_known_args(args=sys.argv[1:]) + + print_output_arrays(flags[0]) + + +if __name__ == "__main__": + main() diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info new file mode 100644 index 0000000..2ede8d4 --- /dev/null +++ b/compiler/bcq-tools/preserve_bcq_info @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 + +import tensorflow as tf +import numpy as np + +import argparse +import sys + + +def _get_parser(): + """ + Returns an ArgumentParser for preserving BCQ information. + """ + parser = argparse.ArgumentParser( + description=("Command line tool to preserve BCQ information")) + + # Input and output path. + parser.add_argument( + "-i", + "--input_path", + type=str, + help="Full filepath of the input file.", + required=True) + parser.add_argument( + "-o", + "--output_path", + type=str, + help="Full filepath of the output file.", + required=True) + + return parser + + +def load_graph(frozen_graph_filename): + """ + Load graph from frozen pb file + """ + with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name='') + return graph + + +def preserve_bcq_info(flags): + """ + Generate unique dummy value from -1 to -N. + + We use negative values to preserve BCQ information because + positive values may cause some confusion with real BCQ information values. 
+ """ + + class UniqueValueGen: + def __init__(self): + self.unique_value = -1 + + def gen(self): + val = self.unique_value + self.unique_value = val - 1 + return val + + unique_value = UniqueValueGen() + + original_graph_model = load_graph(flags.input_path) + original_graph_model_def = original_graph_model.as_graph_def() + + new_graph = tf.compat.v1.GraphDef() + substitution_dict = {} + + DT_INT32 = None # Just for copying DT_INT32 attribute value + + for node in original_graph_model_def.node: + if node.op == "Const": + # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end. + # Therefore we should convert the type to INT32 type. + if "/bcqinfo_do_w_x" in node.name: + original_tensor = tf.make_ndarray(node.attr["value"].tensor) + substitution_dict[node.name] = tf.make_tensor_proto( + [int(original_tensor[0]), unique_value.gen()], tf.int32) + + preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters", + "/bcqinfo_qbits_of_clusters"] + + if any(name in node.name for name in preserved_bcqinfo_list): + original_tensor = tf.make_ndarray( + node.attr["value"].tensor) # variable name change + substitution_dict[node.name] = tf.make_tensor_proto( + np.append(original_tensor, unique_value.gen()), tf.int32) + DT_INT32 = node.attr["dtype"] + + for node in original_graph_model_def.node: + if node.name in substitution_dict: + new_node = new_graph.node.add() + new_node.op = "Const" + new_node.name = node.name + new_node.attr["dtype"].CopyFrom(DT_INT32) + new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name]) + else: + new_node = new_graph.node.add() + new_node.CopyFrom(node) + + tf.io.write_graph(new_graph, '.', flags.output_path, False) + + +def main(): + # Parse argument. + parser = _get_parser() + flags = parser.parse_known_args(args=sys.argv[1:]) + + # Generate a new pb file, which BCQ information is preserved. 
+ preserve_bcq_info(flags[0]) + + +if __name__ == "__main__": + main() diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt index 1335057..009bfab 100644 --- a/compiler/circle-quantizer/CMakeLists.txt +++ b/compiler/circle-quantizer/CMakeLists.txt @@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service) target_link_libraries(circle-quantizer luci_pass) target_link_libraries(circle-quantizer luci_export) target_link_libraries(circle-quantizer arser) +target_link_libraries(circle-quantizer vconone) install(TARGETS circle-quantizer DESTINATION bin) diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake index 2293e53..c21e28e 100644 --- a/compiler/circle-quantizer/requires.cmake +++ b/compiler/circle-quantizer/requires.cmake @@ -5,3 +5,4 @@ require("safemain") require("luci") require("oops") require("arser") +require("vconone") diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp index b56b547..8d3a80c 100644 --- a/compiler/circle-quantizer/src/CircleQuantizer.cpp +++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -36,6 +37,12 @@ using OptionHook = std::function; using Algorithms = luci::CircleOptimizer::Options::Algorithm; using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters; +void print_version(void) +{ + std::cout << "circle-quantizer version " << vconone::get_string() << std::endl; + std::cout << vconone::get_copyright() << std::endl; +} + int entry(int argc, char **argv) { // Simple argument parser (based on map) @@ -49,13 +56,20 @@ int entry(int argc, char **argv) arser::Arser arser("circle-quantizer provides circle model quantization"); + arser.add_argument("--version") + .nargs(0) + .required(false) + .default_value(false) + .help("Show version information and exit") + .exit_with(print_version); + arser.add_argument(qdqw) .nargs(3) .type(arser::DataType::STR_VEC) .required(false) .help("Quantize-dequantize weight values required action before quantization. " "Three arguments required: input_dtype(float32) " - "output_dtype(uint8) granularity(layer)"); + "output_dtype(uint8) granularity(layer, channel)"); arser.add_argument(qwmm) .nargs(3) @@ -63,7 +77,7 @@ int entry(int argc, char **argv) .required(false) .help("Quantize with min/max values. 
" "Three arguments required: input_dtype(float32) " - "output_dtype(uint8) granularity(layer)"); + "output_dtype(uint8) granularity(layer, channel)"); arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model"); arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model"); diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp index a55cd45..38e3073 100644 --- a/compiler/circle-tensordump/driver/Driver.cpp +++ b/compiler/circle-tensordump/driver/Driver.cpp @@ -46,7 +46,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } std::unique_ptr dump; diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp index dfa78f0..a8d3256 100644 --- a/compiler/circle-tensordump/src/Dump.cpp +++ b/compiler/circle-tensordump/src/Dump.cpp @@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s auto max = quant_param->max(); auto scale = quant_param->scale(); auto zero_point = quant_param->zero_point(); + auto quantized_dimension = quant_param->quantized_dimension(); os << " " + print_format2 + "   ├── min : "; ::print_comma_sepearted(os, min); @@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s os << " " + print_format2 + "   ├── scale : "; ::print_comma_sepearted(os, scale); os << std::endl; - os << " " + print_format2 + "   └── zero_point : "; + os << " " + print_format2 + "   ├── zero_point : "; ::print_comma_sepearted(os, zero_point); os << std::endl; + os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension; + os << std::endl; } // buffer @@ -229,7 +232,7 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, } /** - * This function writes data to given hdf5 file like below. + * This function writes vector data to given hdf5 file like below. 
* * GROUP "group_name" * ㄴDATATYPE "type" @@ -238,9 +241,9 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, * ㄴDATA "data" */ template -void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, - const H5::PredType &type, const flatbuffers::Vector *data, - std::vector dims) +void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, + const H5::PredType &type, const flatbuffers::Vector *data, + std::vector dims) { if (data == nullptr) return; @@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d dataset->write(data->data(), type); } +/// @brief This function writes scalar data to given hdf5 file +template +void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, + const H5::PredType &type, T data) +{ + auto dataspace = std::make_unique(H5S_SCALAR); + auto dataset = std::make_unique( + file.createDataSet(group_name + "/" + dataset_name, type, *dataspace)); + dataset->write(&data, type); +} + } // namespace namespace circletensordump @@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data(); if (buff_data_ptr) { - ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), - buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); + ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), + buff_data_ptr, + ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); } // write quantization parameters @@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, if (quant_param) { auto min = quant_param->min(); - ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, - ::hdf5_dims_cast(min)); + ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, + ::hdf5_dims_cast(min)); auto max = quant_param->max(); - ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, - ::hdf5_dims_cast(max)); + ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, + ::hdf5_dims_cast(max)); auto scale = quant_param->scale(); - ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, - ::hdf5_dims_cast(scale)); + ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, + ::hdf5_dims_cast(scale)); auto zero_point = quant_param->zero_point(); - ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point, - ::hdf5_dims_cast(zero_point)); + ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, + zero_point, ::hdf5_dims_cast(zero_point)); + auto quantized_dimension = quant_param->quantized_dimension(); + ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension", + H5::PredType::NATIVE_INT32, quantized_dimension); } } } diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp index 1af31d9..7a44c65 100644 --- a/compiler/circle-verify/src/Driver.cpp +++ b/compiler/circle-verify/src/Driver.cpp @@ -35,7 +35,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } auto verifier = std::make_unique(); diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt 
b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
index 6663cb9..4bcaae3 100644
--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
+++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
@@ -1,25 +1,12 @@
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
-list(APPEND REQUIRED_TARGETS circlechef)
 list(APPEND REQUIRED_TARGETS circle-inspect)
 list(APPEND REQUIRED_TARGETS circle-verify)
 list(APPEND REQUIRED_TARGETS circle2circle)
 list(APPEND REQUIRED_TARGETS dredd_rule_lib)
-list(APPEND REQUIRED_TARGETS tflchef)
-list(APPEND REQUIRED_TARGETS tflite2circle)
 TargetRequire_Return(${REQUIRED_TARGETS})
 
-nncc_find_resource(TensorFlowLiteRecipes)
-nncc_find_resource(CircleRecipes)
-
-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}")
-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}")
-unset(RECIPE_REPO)
-
-set(TEST_RECIPE_FILENAME "test.recipe")
-set(TEST_RULE_FILENAME "test.rule")
-
 unset(TEST_DEPS)
 unset(TEST_NAMES)
 
@@ -27,21 +14,9 @@ set(options "")
 set(oneValueArgs "")
 set(multiValueArgs PASS)
 
-macro(Add RECIPE)
-  if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe")
-    if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe")
-      message(FATAL_ERROR "Missing recipe of '${RECIPE}' test")
-    else()
-      set(RECIPE_REPO ${CIRCLE_RECIPE_REPO})
-    endif()
-  else()
-    set(RECIPE_REPO ${TFLITE_RECIPE_REPO})
-  endif()
-
-  if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule")
-    message(FATAL_ERROR "Missing rule of '${RECIPE}' test")
-  endif()
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
 
+macro(Add RECIPE)
   cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   unset(OPT_OPTIONS)
   foreach(src ${ARG_PASS})
@@ -49,71 +24,20 @@ macro(Add RECIPE)
     list(APPEND OPT_OPTIONS "--${src}")
   endforeach(src ${ARG_PASS})
 
-  set(RECIPE_FILE "${RECIPE}.recipe")
-  set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}")
-  set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}")
-
-  set(RULE_FILE "${RECIPE}.rule")
-  set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}")
-  set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}")
-
-  set(TFLITE_FILE "${RECIPE}.tflite")
-  set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}")
-
   set(CIRCLE_FILE "${RECIPE}.circle")
-  set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}")
+  set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}")
 
   set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle")
   set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}")
 
-  # Copy .recipe
-  add_custom_command(OUTPUT ${RECIPE_BINARY_PATH}
-    COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}"
-    DEPENDS ${RECIPE_SOURCE_PATH}
-    COMMENT "Generate ${RECIPE_FILE}"
-  )
-
-  # Copy .rule
-  add_custom_command(OUTPUT ${RULE_BINARY_PATH}
-    COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
-    DEPENDS ${RULE_SOURCE_PATH}
-    COMMENT "Generate ${RULE_FILE}"
-  )
-
-  if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO})
-    # Generate .tflite
-    add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH}
-      DEPENDS $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH}
-      COMMENT "Generate ${TFLITE_FILE}"
-    )
-
-    # Generate .circle
-    add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
-      DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
-      COMMENT "Generate ${CIRCLE_FILE}"
-    )
-
-    list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH})
-  else()
-    # Generate .circle
-    add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
-      DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
-      COMMENT "Generate ${CIRCLE_FILE}"
-    )
-  endif()
-
   # Generate optimized .circle
   add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
-    COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
-    DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_OUTPUT_PATH}
+    COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+    DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_PATH}
     COMMENT "Generate ${OPT_CIRCLE_FILE}"
   )
 
-  list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH}
-                        ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH})
+  list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH})
   list(APPEND TEST_NAMES ${RECIPE})
 endmacro(Add)
 
@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}")
 
 # Generate dependencies
 add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS})
+add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps)
+
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
 
 # Run tests
 add_test(
   NAME circle2circle_dredd_recipe_test
   COMMAND "${TEST_RUNNER}"
           "${TEST_CONFIG}"
-          "${CMAKE_CURRENT_BINARY_DIR}"
+          "${ARTIFACTS_BIN_PATH}"
          ${TEST_NAMES}
 )
diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake
index e4a5b71..70e7c52 100644
--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake
+++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake
@@ -1,7 +1,5 @@
-require("circlechef")
 require("circle2circle")
 require("circle-inspect")
 require("circle-verify")
+require("common-artifacts")
 require("dredd-rule-lib")
-require("tflchef")
-require("tflite2circle")
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 202f669..6328a64 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -11,9 +11,10 @@
 ## TFLITE RECIPE
 
 Add(Net_InstanceNorm_001 PASS fuse_instnorm)
-# Add(Net_InstanceNorm_002 PASS fuse_instnorm)
+Add(Net_InstanceNorm_002 PASS fuse_instnorm)
 Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
 Add(MatMul_000 PASS resolve_customop_matmul)
+Add(DepthwiseConv2D_003 PASS)
 
 ## CIRCLE RECIPE
diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh
index 33a2036..2899587 100755
--- a/compiler/circle2circle-dredd-recipe-test/testall.sh
+++ b/compiler/circle2circle-dredd-recipe-test/testall.sh
@@ -13,21 +13,22 @@
 if [[ $# -lt 2 ]]; then
   exit 255
 fi
 
+WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 CONFIG_PATH="$1"; shift
-WORKDIR="$1"; shift
+RESOURCE_DIR="$1"; shift
 
 source "${CONFIG_PATH}"
 
 echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}"
 echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}"
 echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}"
-echo "-- Found workdir: ${WORKDIR}"
+echo "-- Found common-artifacts: ${RESOURCE_DIR}"
 
 TESTED=()
 PASSED=()
 FAILED=()
 
-pushd "${WORKDIR}"
+pushd ${WORKDIR}
 while [[ $# -ne 0 ]]; do
   PREFIX="$1"; shift
 
@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do
   cat > "${PREFIX}.log" <(
     exec 2>&1
 
-    echo "-- Found tflite: ${PREFIX}.tflite"
+    echo "-- Found circle: ${PREFIX}.opt.circle"
 
     # Exit immediately if any command fails
     set -e
@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do
     set +x
 
     # (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh
-    COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle"
+    COMPILED_FILE="${PREFIX}.opt.circle"
     INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH}
     VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH}
     ERROR_LOG="${PREFIX}.error"
@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do
     trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR
 
     source rule-lib.sh
-    source "${PREFIX}.rule"
+    source "${RESOURCE_DIR}/${PREFIX}.rule"
 
     # unset trap - ERR
diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt
index 7b2bf9b..f60c896 100644
--- a/compiler/circle2circle/CMakeLists.txt
+++ b/compiler/circle2circle/CMakeLists.txt
@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service)
 target_link_libraries(circle2circle luci_pass)
 target_link_libraries(circle2circle luci_export)
 target_link_libraries(circle2circle arser)
+target_link_libraries(circle2circle vconone)
 
 install(TARGETS circle2circle DESTINATION bin)
 
@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service)
 target_link_libraries(circle2circle_test luci_pass)
 target_link_libraries(circle2circle_test luci_export)
 target_link_libraries(circle2circle_test arser)
+target_link_libraries(circle2circle_test vconone)
diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake
index 8cbb90d..36a9efd 100644
--- a/compiler/circle2circle/requires.cmake
+++ b/compiler/circle2circle/requires.cmake
@@ -9,3 +9,4 @@ require("hermes")
 require("hermes-std")
 require("luci")
 require("arser")
+require("vconone")
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 6888d26..849597b 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -26,6 +26,7 @@
 #include
 #include
+#include <vconone/vconone.h>
 
 #include
 #include
@@ -34,6 +35,12 @@
 using Algorithms = luci::CircleOptimizer::Options::Algorithm;
 using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
 
+void print_version(void)
+{
+  std::cout << "circle2circle version " << vconone::get_string() << std::endl;
+  std::cout << vconone::get_copyright() << std::endl;
+}
+
 int entry(int argc, char **argv)
 {
   // Simple argument parser (based on map)
@@ -44,6 +51,13 @@ int entry(int argc, char **argv)
 
   arser::Arser arser("circle2circle provides circle model optimization and transformations");
 
+  arser.add_argument("--version")
+      .nargs(0)
+      .required(false)
+      .default_value(false)
+      .help("Show version information and exit")
+      .exit_with(print_version);
+
   arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
       "Enable all optimize options");
 
diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt
index cba7d0a..3e2ddcb 100644
--- a/compiler/circlechef/CMakeLists.txt
+++ b/compiler/circlechef/CMakeLists.txt
@@ -18,4 +18,6 @@ add_subdirectory(core)
 add_subdirectory(circle)
 # Tools
 add_subdirectory(tools)
-add_subdirectory(tests)
+if(ENABLE_TEST)
+  add_subdirectory(tests)
+endif(ENABLE_TEST)
diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp
index 17ef1be..51326c7 100644
--- a/compiler/circlechef/circle/src/RecipeChef.cpp
+++ b/compiler/circlechef/circle/src/RecipeChef.cpp
@@ -181,6 +181,8 @@ std::unique_ptr generate_recipe(const circle::Model *model)
       for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
         chef_quant->add_zero_point(quant->zero_point()->Get(idx));
     }
+    circlechef::TensorQuantization *chef_quant = operand->mutable_quant();
+    chef_quant->set_quantized_dimension(quant->quantized_dimension());
   }
 }
diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp
index 76aeacd..d81467d 100644
--- a/compiler/circlechef/core/src/ModelChef.cpp
+++ b/compiler/circlechef/core/src/ModelChef.cpp
@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
     quant_builder.add_min(quant_min);
     quant_builder.add_scale(quant_scale);
     quant_builder.add_zero_point(quant_zero_point);
+    quant_builder.add_quantized_dimension(quant.quantized_dimension());
 
     // Update QuantizationParameters Index
     quant_index = quant_builder.Finish();
diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto
index b8c009b..3e5e6b1 100644
--- a/compiler/circlechef/proto/circlechef.proto
+++ b/compiler/circlechef/proto/circlechef.proto
@@ -35,6 +35,7 @@ message TensorQuantization {
   repeated float max = 2;
   repeated float scale = 3;
   repeated int64 zero_point = 4;
+  optional int32 quantized_dimension = 5 [default = 0];
 }
 
 message Operand {
diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
index a15da40..bcc0c7a 100644
--- a/compiler/circlechef/tools/file/Driver.cpp
+++ b/compiler/circlechef/tools/file/Driver.cpp
@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
   {
     std::cout << err.what() << std::endl;
     std::cout << arser;
-    return 0;
+    return 255;
   }
 
   int32_t model_version = 1;
diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
index 9c0b9ea..8a2b85f 100644
--- a/compiler/circlechef/tools/reverse/Driver.cpp
+++ b/compiler/circlechef/tools/reverse/Driver.cpp
@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
   {
     std::cout << err.what() << std::endl;
     std::cout << arser;
-    return 0;
+    return 255;
  }
 
  std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp
index b8f561f..657f24f 100644
--- a/compiler/circledump/driver/Driver.cpp
+++ b/compiler/circledump/driver/Driver.cpp
@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
   {
     std::cout << err.what() << '\n';
     std::cout << arser;
-    return 0;
+    return 255;
  }
 
  std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 2c03203..5aa5d51 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -593,6 +593,20 @@ public:
   }
 };
 
+class UniquePrinter : public OpPrinter
+{
+public:
+  void options(const circle::Operator *op, std::ostream &os) const override
+  {
+    if (auto *params = op->builtin_options_as_UniqueOptions())
+    {
+      os << " ";
+      os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") ";
+      os << std::endl;
+    }
+  }
+};
+
 class WhilePrinter : public OpPrinter
 {
 public:
@@ -744,6 +758,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[circle::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
   _op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
   // There is no Option for TOPK_V2
+  _op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
   _op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
 
   _op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index b614b71..d3f5601 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -5,9 +5,12 @@
 #[[ optimize : Exclude from circle optimization(circle2circle) ]]
 
 ## TensorFlowLiteRecipes
-optimize(ReLU6_000)
-optimize(Where_000) -optimize(Where_001) +optimize(Unique_000) +optimize(Unique_001) +optimize(Unique_002) +optimize(Unique_003) +optimize(Unique_U8_000) +optimize(Unique_U8_001) ## CircleRecipes @@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000) tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation tcgenerate(DepthwiseConv2D_U8_000) +tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet tcgenerate(Div_000) tcgenerate(ELU_000) tcgenerate(Equal_000) @@ -96,7 +100,7 @@ tcgenerate(Neg_000) tcgenerate(Net_Dangle_001) tcgenerate(Net_InstanceNorm_001) tcgenerate(Net_InstanceNorm_002) -tcgenerate(Net_ZeroDim_001) # fix luci +tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim tcgenerate(NotEqual_000) tcgenerate(OneHot_000) tcgenerate(OneHot_001) @@ -120,9 +124,9 @@ tcgenerate(ReduceProd_001) tcgenerate(ReduceProd_002) tcgenerate(ReduceProd_003) tcgenerate(ReLU_000) -tcgenerate(ReLU6_000) # luci NYI +tcgenerate(ReLU6_000) tcgenerate(ReLUN1To1_000) -tcgenerate(Reshape_003) # fix luci +tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option tcgenerate(Reshape_U8_000) tcgenerate(ResizeBilinear_000) tcgenerate(ResizeNearestNeighbor_000) @@ -148,7 +152,7 @@ tcgenerate(SpaceToBatchND_002) tcgenerate(SpaceToBatchND_003) tcgenerate(SpaceToDepth_000) tcgenerate(SparseToDense_000) -tcgenerate(SplitV_000) # fix luci +tcgenerate(SplitV_000) tcgenerate(Sqrt_000) tcgenerate(Square_000) tcgenerate(SquaredDifference_000) @@ -164,22 +168,21 @@ tcgenerate(Sum_001) tcgenerate(Tanh_000) tcgenerate(Tile_000) tcgenerate(Tile_U8_000) -tcgenerate(TopKV2_000) # fix luci -tcgenerate(TopKV2_001) # fix luci -tcgenerate(TransposeConv_000) # fix interpreter +tcgenerate(TopKV2_000) +tcgenerate(TopKV2_001) tcgenerate(Unique_000) tcgenerate(Unique_001) tcgenerate(Unique_002) tcgenerate(Unique_003) tcgenerate(Unique_U8_000) tcgenerate(Unique_U8_001) -tcgenerate(Where_000) # luci NYI -tcgenerate(Where_001) # luci NYI -tcgenerate(While_000) # fix luci +tcgenerate(Where_000) +tcgenerate(Where_001) +tcgenerate(While_000) tcgenerate(While_001) tcgenerate(While_002) tcgenerate(While_003) -tcgenerate(YUV_TO_RGB_000) # fix luci +tcgenerate(YUV_TO_RGB_000) tcgenerate(YUV_TO_RGB_U8_000) tcgenerate(ZerosLike_000) diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp index 2cbc093..ea7ef65 100644 --- a/compiler/hermes/src/hermes.test.cpp +++ b/compiler/hermes/src/hermes.test.cpp @@ -18,7 +18,28 @@ #include -TEST(HermesTest, simple_usecase) +namespace { - // TO BE FILLED + +class Logger final : public hermes::Source +{ +public: + Logger(hermes::Context *ctx); + ~Logger(); +}; + +Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); } +Logger::~Logger() { deactivate(); } + +} // namespace + +TEST(HermesTest, logger_constructor_NEG) +{ + hermes::Context context; + // we expect segmentfault from nullptr->sources() + ASSERT_DEATH(Logger logger(&context), ""); + + SUCCEED(); } + +// TODO add HermesTest simple_usecase diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp index cdb255c..4680f5c 100644 --- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp +++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp @@ -90,6 +90,16 @@ template void test() } } // namespace -TEST(NodeExecution_BiasEncode, s32) { test(); } +TEST(NodeExecution_BiasEncode, s32) +{ + test(); + + 
SUCCEED(); +} -TEST(NodeExecution_BiasEncode, f32) { test(); } +TEST(NodeExecution_BiasEncode, f32) +{ + test(); + + SUCCEED(); +} diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp index f1f3a52..7d942e1 100644 --- a/compiler/locomotiv/src/Node/MatMul.test.cpp +++ b/compiler/locomotiv/src/Node/MatMul.test.cpp @@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3) }; run_test(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32); + + SUCCEED(); } /* from the code below: @@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6) }; run_test(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32); + + SUCCEED(); } // clang-format on diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp index c9808d3..aff9ebe 100644 --- a/compiler/locop/src/FormattedGraph.test.cpp +++ b/compiler/locop/src/FormattedGraph.test.cpp @@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple) // TODO Validate the output (when the implementation becomes stable) std::cout << locop::fmt(g) << std::endl; + + SUCCEED(); } TEST(LinearV1FormatterTest, user_defined_node_summary_builder) diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp index 0f0017a..fc85df3 100644 --- a/compiler/locop/src/FormattedTensorShape.test.cpp +++ b/compiler/locop/src/FormattedTensorShape.test.cpp @@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat) tensor_shape->dim(0) = 4; std::cout << fmt(tensor_shape.get()) << std::endl; + + SUCCEED(); } diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h index 9987898..4ac3d86 100644 --- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h +++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h @@ -79,12 +79,11 @@ private: // // Note that due to historical and performance reasons, per-tensor quantization uses unsigned // integer types, while per-channel uses signed types assuming 'zero_point' == 0. -// -// TODO Add 'quantized_dimension' field for per-channel case when IR provides it. 
struct AffineQuantization { std::vector scale; std::vector zero_point; + int32_t quantized_dimension; }; class Tensor @@ -108,6 +107,12 @@ public: return _quantization.zero_point[0]; } + const std::vector &scales() const { return _quantization.scale; } + + const std::vector &zero_points() const { return _quantization.zero_point; } + + int32_t quantized_dimension() const { return _quantization.quantized_dimension; } + template const T *data() const { return reinterpret_cast(_data.get()); } template T *data() { return reinterpret_cast(_data.get()); } diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h index a32e0d4..65d1197 100644 --- a/compiler/luci-interpreter/src/core/KernelParams.h +++ b/compiler/luci-interpreter/src/core/KernelParams.h @@ -56,6 +56,11 @@ struct Conv2DParams Activation activation; }; +struct DepthToSpaceParams +{ + int block_size; +}; + struct DepthwiseConv2DParams { Padding padding; diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt index fe36231..a1fd1de 100644 --- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt +++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt @@ -12,6 +12,8 @@ set(SOURCES Concatenation.cpp Conv2D.h Conv2D.cpp + DepthToSpace.h + DepthToSpace.cpp DepthwiseConv2D.h DepthwiseConv2D.cpp Elu.h @@ -40,6 +42,10 @@ set(SOURCES Pad.cpp Reshape.h Reshape.cpp + Reverse.h + Reverse.cpp + Slice.h + Slice.cpp Softmax.h Softmax.cpp SpaceToDepth.h @@ -77,6 +83,7 @@ set(TEST_SOURCES AveragePool2D.test.cpp Concatenation.test.cpp Conv2D.test.cpp + DepthToSpace.test.cpp DepthwiseConv2D.test.cpp Elu.test.cpp FullyConnected.test.cpp @@ -91,6 +98,8 @@ set(TEST_SOURCES Mul.test.cpp Pad.test.cpp Reshape.test.cpp + Reverse.test.cpp + Slice.test.cpp Softmax.test.cpp SpaceToDepth.test.cpp Split.test.cpp diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp new file mode 100644 index 0000000..cab63e2 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DepthToSpace.h" +#include "Utils.h" +#include + +namespace luci_interpreter +{ +namespace kernels +{ + +DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms) + : KernelWithParams({input}, {output}, params) +{ +} + +void DepthToSpace::configure() +{ + if (input()->shape().num_dims() != 4) + { + throw std::runtime_error("Invalid input num_dims."); + } + if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 && + output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 && + output()->element_type() != DataType::S64) + { + throw std::runtime_error("Invalid output type"); + } + if (input()->element_type() != output()->element_type()) + { + throw std::runtime_error("Type mismatch on input and output."); + } + const int block_size = params().block_size; + const int32_t input_height = input()->shape().dim(1); + const int32_t input_width = input()->shape().dim(2); + const int32_t input_channels = input()->shape().dim(3); + int32_t output_height = input_height * block_size; + int32_t output_width = input_width * block_size; + int32_t output_channels = input_channels / block_size / block_size; + + assert(input_height == output_height / block_size); + assert(input_width == output_width / block_size); + assert(input_channels == output_channels * block_size * block_size); + + Shape output_shape(4); + output_shape.dim(0) = input()->shape().dim(0); + output_shape.dim(1) = output_height; + output_shape.dim(2) = output_width; + output_shape.dim(3) = output_channels; + + output()->resize(output_shape); +} + +void DepthToSpace::execute() const +{ + tflite::DepthToSpaceParams op_params; + op_params.block_size = params().block_size; + switch (input()->element_type()) + { + case DataType::FLOAT32: + tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + case DataType::U8: + tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + default: + throw std::runtime_error("Unsupported Type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h new file mode 100644 index 0000000..63ce376 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H +#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +#include + +namespace luci_interpreter +{ +namespace kernels +{ + +class DepthToSpace : public KernelWithParams +{ +public: + DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms); + + const Tensor *input() const { return _inputs[0]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp new file mode 100644 index 0000000..1b80570 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/DepthToSpace.h" +#include "kernels/TestUtils.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template class DepthToSpaceTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types; +TYPED_TEST_CASE(DepthToSpaceTest, DataTypes); + +TYPED_TEST(DepthToSpaceTest, SimpleCase) +{ + std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8}; + Shape input_shape{1, 1, 2, 4}; + std::vector output_data{1, 2, 5, 6, 3, 4, 7, 8}; + std::vector output_shape{1, 2, 4, 1}; + + Tensor input_tensor = makeInputTensor()>(input_shape, input_data); + Tensor output_tensor = makeOutputTensor(getElementType()); + + DepthToSpaceParams params{}; + params.block_size = 2; + + DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp index fad450d..f53eaca 100644 --- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp @@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float) ElementsAreArray(ArrayFloatNear(ref_output_data))); } -TEST(L2NormalizeTest, Uint8Quantized) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case -} +// TODO Uint8Quantized +// Implement GetDequantizedOutput Function. 
+// Create Test for Uint8 Case } // namespace } // namespace kernels diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp index b0c06e7..c79d3d6 100644 --- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp @@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple) 1.0f, -0.5f, -1.0f, // Row 2 }, /*alpha=*/0.5f, getElementType()); -} -TEST(LeakReluTest, Uint8Simple) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case + SUCCEED(); } +// TODO Uint8Simple +// Implement GetDequantizedOutput Function. +// Create Test for Uint8 Case + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp index 17456a4..00feddf 100644 --- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp @@ -49,10 +49,8 @@ TEST(LogisticTest, Float) // TODO make a Shape checking of output_tensor. } -TEST(LogisticTest, Uint8) -{ - // Need to Implement GetDequantizedOutput Function. -} +// TODO Uint8 +// Need to Implement GetDequantizedOutput Function. } // namespace } // namespace kernels diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp new file mode 100644 index 0000000..a463084 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Reverse.h" +#include "kernels/Utils.h" +#include + +namespace luci_interpreter +{ + +namespace kernels +{ + +Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output) + : Kernel({input, axes}, {output}) +{ +} + +void Reverse::configure() +{ + assert(axes()->shape().num_dims() == 1); + assert(input()->shape().num_dims() >= axes()->shape().num_elements()); + if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 && + input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 && + input()->element_type() != DataType::S64) + { + throw std::runtime_error("Unsupported input type."); + } + if (axes()->element_type() != DataType::S32) + { + throw std::runtime_error("Unsupported axes type."); + } + if (axes()->shape().num_elements() > 1) + { + throw std::runtime_error("Current implementation does not support more than 1 axis."); + } + int axis_value = getTensorData(axes())[0]; + if (axis_value < 0 || axis_value >= input()->shape().num_dims()) + { + throw std::runtime_error("Invalid axes value"); + } + assert(input()->element_type() == output()->element_type()); + + output()->resize(input()->shape()); +} + +void Reverse::execute() const +{ + int axis_value = getTensorData(axes())[0]; + switch (output()->element_type()) + { + case DataType::FLOAT32: + tflite::reference_ops::Reverse(axis_value, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + case DataType::U8: + tflite::reference_ops::Reverse( + axis_value, getTensorShape(input()), getTensorData(input()), + getTensorShape(output()), getTensorData(output())); + break; + default: + throw std::runtime_error("Unsupported output type"); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h new file mode 100644 index 0000000..3489dae --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Reverse.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H +#define LUCI_INTERPRETER_KERNELS_REVERSE_H + +#include "core/Kernel.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Reverse : public Kernel +{ +public: + Reverse(const Tensor *input, const Tensor *axes, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *axes() const { return _inputs[1]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp new file mode 100644 index 0000000..5475a8b --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/Reverse.h" +#include "kernels/TestUtils.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template class ReverseTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types; +TYPED_TEST_CASE(ReverseTest, DataTypes); + +TYPED_TEST(ReverseTest, MultiDimensions) +{ + // TypeParam + std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; + Shape input_shape{4, 3, 2}; + std::vector axis_data{1}; + Shape axis_shape{1}; + + std::vector output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, + 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}; + std::vector output_shape{4, 3, 2}; + + Tensor input_tensor = makeInputTensor()>(input_shape, input_data); + Tensor axis_tensor = makeInputTensor(axis_shape, axis_data); + + Tensor output_tensor = makeOutputTensor(getElementType()); + + Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp new file mode 100644 index 0000000..c4bc3c5 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Slice.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/Slice.h" +#include "Utils.h" +#include + +#include +#include + +namespace luci_interpreter +{ + +namespace kernels +{ +const int max_dim = 4; + +Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output) + : Kernel({input, begin, size}, {output}) +{ +} + +template +Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size) +{ + Shape output_shape = Shape(input->shape().num_dims()); + for (int idx = 0; idx < input->shape().num_dims(); idx++) + { + T size_value = getTensorData(size)[idx]; + if (size_value < 0) + { + if (size_value != -1) + { + throw std::runtime_error("Invalid size."); + } + size_value = input->shape().dim(idx) - getTensorData(begin)[idx]; + } + else + { + if (input->shape().dim(idx) < getTensorData(begin)[idx] + size_value) + { + throw std::runtime_error("Invalid begin and size."); + } + } + output_shape.dim(idx) = static_cast(size_value); + } + return output_shape; +} + +template +void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size, + std::vector *begins, std::vector *sizes) +{ + for (int idx = dimensions - 1; idx >= 0; --idx) + { + begins->push_back(getTensorData(begin)[idx]); + sizes->push_back(getTensorData(size)[idx]); + } +} + +void Slice::configure() +{ + assert(input()->element_type() == output()->element_type()); + assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64); + assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64); + assert(begin()->shape().num_dims() == 1); + assert(size()->shape().num_dims() == 1); + assert(input()->shape().num_dims() <= max_dim); + + if (begin()->element_type() == DataType::S32) + { + output()->resize(calculateOutputShape(input(), begin(), size())); + } + else if (begin()->element_type() == DataType::S64) + { + output()->resize(calculateOutputShape(input(), begin(), size())); + } + else + { + throw std::runtime_error("Unsupported type."); + } +} + +void Slice::execute() const +{ + std::vector begins; + begins.reserve(max_dim); + std::vector sizes; + sizes.reserve(max_dim); + if (begin()->element_type() == DataType::S32) + { + getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); + } + else if (begin()->element_type() == DataType::S64) + { + getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); + } + else + { + throw std::runtime_error("Unsupported begin type."); + } + for (int i = input()->shape().num_dims(); i < max_dim; ++i) + { + begins.push_back(0); + sizes.push_back(1); + } + + assert(begins.size() == 4); + assert(sizes.size() == 4); + tflite::SliceParams op_params{}; + op_params.begin_count = 4; + op_params.size_count = 4; + for (int i = 0; i < 4; i++) + { + op_params.begin[i] = begins[3 - i]; + op_params.size[i] = sizes[3 - i]; + } + switch (input()->element_type()) + { + case DataType::FLOAT32: + tflite::optimized_ops::Slice(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + case 
DataType::U8: + tflite::optimized_ops::Slice(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + default: + throw std::runtime_error("Unsupported input type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h new file mode 100644 index 0000000..23c3596 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Slice.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H +#define LUCI_INTERPRETER_KERNELS_SLICE_H + +#include "core/Kernel.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Slice : public Kernel +{ +public: + Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *begin() const { return _inputs[1]; } + const Tensor *size() const { return _inputs[2]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_SLICE_H diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp new file mode 100644 index 0000000..a360a29 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Slice.h" +#include "kernels/TestUtils.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template class SliceTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types; +TYPED_TEST_CASE(SliceTest, DataTypes); + +TYPED_TEST(SliceTest, SimpleTest) +{ + std::vector input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}; + Shape input_shape{3, 2, 3, 1}; + std::vector begin_data{1, 0, 0, 0}; + Shape begin_shape{4}; + std::vector size_data{2, 1, -1, 1}; + Shape size_shape{4}; + std::vector output_data{3, 3, 3, 5, 5, 5}; + std::vector output_shape{2, 1, 3, 1}; + + Tensor input_tensor = makeInputTensor()>(input_shape, input_data); + Tensor begin_tensor = makeInputTensor(begin_shape, begin_data); + Tensor size_tensor = makeInputTensor(size_shape, size_data); + + Tensor output_tensor = makeOutputTensor(getElementType()); + + Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp index 3386d36..b8c0ac4 100644 --- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp @@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple) /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365}, /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, getElementType()); + + SUCCEED(); } TEST(TransposeConvTest, FloatTwoFiltersTest) @@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest) 3352, 3652, 2760}, /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, getElementType()); -} -TEST(TransposeConvTest, Uint8Simple) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case -} -TEST(TransposeConvTest, Uint8FiltersTest) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case + SUCCEED(); } +// TODO Uint8Simple +// Implement GetDequantizedOutput Function. +// Create Test for Uint8 Case + +// TODO Uint8FiltersTest +// Implement GetDequantizedOutput Function. 
+// Create Test for Uint8 Case + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt index fb36c4a..d99485d 100644 --- a/compiler/luci-interpreter/src/loader/CMakeLists.txt +++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt @@ -1,3 +1,5 @@ +nnas_find_package(GTest REQUIRED) + set(SOURCES GraphLoader.h GraphLoader.cpp @@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO target_link_libraries(luci_interpreter_loader PUBLIC luci_lang luci_interpreter_core PRIVATE luci_interpreter_kernels nncc_common) + +set(TEST_SOURCES KernelBuilder.test.cpp) + +GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES}) +target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader) diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp index 779fa06..6ebf979 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp +++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp @@ -16,7 +16,6 @@ #include "loader/GraphLoader.h" -#include "loader/ModuleLoader.h" #include "loader/KernelBuilder.h" #include @@ -71,6 +70,7 @@ bool isExecutableNode(const luci::CircleNode *node) { // These nodes denote inputs / outputs of a graph. case luci::CircleOpcode::CONST: + case luci::CircleOpcode::CIRCLECONST: case luci::CircleOpcode::CIRCLEINPUT: case luci::CircleOpcode::CIRCLEOUTPUT: // The following nodes denote outputs of multiple-output nodes. @@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node) } // namespace -GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, - RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, - std::unordered_map &node_to_tensor) - : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph), - _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor) +GraphLoader::GraphLoader( + const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, + const std::unordered_map &graph_to_runtime_graph, + std::unordered_map &node_to_tensor) + : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir), + _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) { } @@ -136,6 +137,7 @@ void GraphLoader::loadTensors() const luci::CircleQuantParam *params = node->quantparam(); quantization.scale.assign(params->scale.cbegin(), params->scale.cend()); quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend()); + quantization.quantized_dimension = params->quantized_dimension; } auto tensor = std::make_unique(node->dtype(), std::move(shape), std::move(quantization), @@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const void GraphLoader::loadOperators() { - KernelBuilder kernel_builder(_module_loader, *this); + KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor); // Create kernels for executable nodes. This has to be done in execution order. 
for (const loco::Node *loco_node : @@ -195,11 +197,4 @@ void GraphLoader::loadOperators() } } -void GraphLoader::load() -{ - loadTensors(); - initInputOutputTensors(); - loadOperators(); -} - } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h index e0adc0f..89c5bca 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.h +++ b/compiler/luci-interpreter/src/loader/GraphLoader.h @@ -27,29 +27,23 @@ namespace luci_interpreter { -class ModuleLoader; - class GraphLoader { public: - GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, - RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, + GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, + const std::unordered_map &graph_to_runtime_graph, std::unordered_map &node_to_tensor); - void load(); - - Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); } - -private: - void loadOperators(); - void initInputOutputTensors() const; void loadTensors(); + void initInputOutputTensors() const; + void loadOperators(); - const ModuleLoader &_module_loader; +private: const loco::Graph *_graph; RuntimeGraph *_runtime_graph; RuntimeToIR &_runtime_to_ir; + const std::unordered_map &_graph_to_runtime_graph; std::unordered_map &_node_to_tensor; }; diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp index 56da961..c19f897 100644 --- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp @@ -21,6 +21,7 @@ #include "kernels/AveragePool2D.h" #include "kernels/Concatenation.h" #include "kernels/Conv2D.h" +#include "kernels/DepthToSpace.h" #include "kernels/DepthwiseConv2D.h" #include "kernels/Elu.h" #include "kernels/FullyConnected.h" @@ -35,6 +36,8 @@ #include "kernels/Mul.h" #include "kernels/Pad.h" #include "kernels/Reshape.h" +#include "kernels/Reverse.h" +#include "kernels/Slice.h" #include "kernels/Softmax.h" #include "kernels/SpaceToDepth.h" #include "kernels/Split.h" @@ -43,8 +46,6 @@ #include "kernels/Unpack.h" #include "kernels/Transpose.h" #include "kernels/TransposeConv.h" -#include "loader/GraphLoader.h" -#include "loader/ModuleLoader.h" #include @@ -68,7 +69,7 @@ static std::vector collectOutputNodes(const luci::CircleNode const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const { - const Tensor *tensor = _graph_loader.getTensorForNode(node); + const Tensor *tensor = _node_to_tensor.at(node); assert(tensor != nullptr); return tensor; } @@ -81,7 +82,7 @@ const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) cons Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const { - Tensor *tensor = _graph_loader.getTensorForNode(node); + Tensor *tensor = _node_to_tensor.at(node); assert(tensor != nullptr); return tensor; } @@ -98,7 +99,7 @@ KernelBuilder::getOutputTensors(const std::vector &nodes) co RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const { - RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph); + RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); assert(runtime_graph != nullptr); return runtime_graph; } @@ -120,14 +121,14 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleAdd *node) std::unique_ptr KernelBuilder::visit(const luci::CircleArgMax *node) { assert(node->arity() == 2); - const Tensor *input1 = 
getInputTensor(node->input()); - const Tensor *input2 = getInputTensor(node->dimension()); + const Tensor *input = getInputTensor(node->input()); + const Tensor *axis = getInputTensor(node->dimension()); Tensor *output = getOutputTensor(node); ArgMaxParams params{}; params.output_type = node->output_type(); - return std::make_unique(input1, input2, output, params); + return std::make_unique(input, axis, output, params); } std::unique_ptr KernelBuilder::visit(const luci::CircleAveragePool2D *node) @@ -188,6 +189,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleConv2D *node) return std::make_unique(input, filter, bias, output, params); } +std::unique_ptr KernelBuilder::visit(const luci::CircleDepthToSpace *node) +{ + assert(node->arity() == 1); + + const Tensor *input = getInputTensor(node->input()); + Tensor *output = getOutputTensor(node); + + DepthToSpaceParams params{}; + params.block_size = node->block_size(); + + return std::make_unique(input, output, params); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node) { assert(node->arity() == 3); @@ -224,14 +238,14 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleFullyConnected *n assert(node->arity() == 3); const Tensor *input = getInputTensor(node->input()); - const Tensor *filter = getInputTensor(node->weights()); + const Tensor *weights = getInputTensor(node->weights()); const Tensor *bias = getOptionalInputTensor(node->bias()); Tensor *output = getOutputTensor(node); FullyConnectedParams params{}; params.activation = node->fusedActivationFunction(); - return std::make_unique(input, filter, bias, output, params); + return std::make_unique(input, weights, bias, output, params); } std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) @@ -255,6 +269,11 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) else_graph); } +std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) +{ + throw std::runtime_error("Input node cannot be executed."); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleL2Normalize *node) { assert(node->arity() == 1); @@ -323,11 +342,6 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleLogistic *node) return std::make_unique(input, output); } -std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) -{ - throw std::runtime_error("Input node cannot be executed."); -} - std::unique_ptr KernelBuilder::visit(const luci::CircleMaxPool2D *node) { assert(node->arity() == 1); @@ -402,6 +416,30 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleReshape *node) return std::make_unique(input, shape, output); } +std::unique_ptr KernelBuilder::visit(const luci::CircleReverseV2 *node) +{ + assert(node->arity() == 2); + + const Tensor *input = getInputTensor(node->tensor()); + const Tensor *axes = getInputTensor(node->axis()); + Tensor *output = getOutputTensor(node); + + return std::make_unique(input, axes, output); +} + +std::unique_ptr KernelBuilder::visit(const luci::CircleSlice *node) +{ + assert(node->arity() == 3); + + const Tensor *input = getInputTensor(node->input()); + const Tensor *begin = getInputTensor(node->begin()); + const Tensor *size = getInputTensor(node->size()); + + Tensor *output = getOutputTensor(node); + + return std::make_unique(input, begin, size, output); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleSoftmax *node) { assert(node->arity() == 1); @@ -442,6 +480,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleSplit *node) return std::make_unique(axis, input, 
std::move(outputs)); } +std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) +{ + assert(node->arity() == 1); + + const Tensor *input = getInputTensor(node->input()); + Tensor *output = getOutputTensor(node); + + SqueezeParams params{}; + params.squeeze_dims = node->squeeze_dims(); + + return std::make_unique(input, output, params); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *node) { assert(node->arity() == 4); @@ -463,21 +514,15 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *nod return std::make_unique(input, begin, end, strides, output, params); } -std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) +std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) { - assert(node->arity() == 1); + assert(node->arity() == 2); - const Tensor *input = getInputTensor(node->input()); + const Tensor *input = getInputTensor(node->a()); + const Tensor *perm = getInputTensor(node->perm()); Tensor *output = getOutputTensor(node); - SqueezeParams params{}; - assert(node->squeeze_dims().size() <= 4); - for (size_t i = 0; i < node->squeeze_dims().size(); i++) - { - params.squeeze_dims.push_back(node->squeeze_dims().at(i)); - } - - return std::make_unique(input, output, params); + return std::make_unique(input, perm, output); } std::unique_ptr KernelBuilder::visit(const luci::CircleTransposeConv *node) @@ -515,15 +560,4 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleUnpack *node) return std::make_unique(input, std::move(outputs), params); } -std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) -{ - assert(node->arity() == 2); - - const Tensor *input = getInputTensor(node->a()); - const Tensor *perm = getInputTensor(node->perm()); - Tensor *output = getOutputTensor(node); - - return std::make_unique(input, perm, output); -} - } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h index 7e30d39..d5c5a4b 100644 --- a/compiler/luci-interpreter/src/loader/KernelBuilder.h +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h @@ -24,18 +24,18 @@ #include #include +#include namespace luci_interpreter { -class GraphLoader; -class ModuleLoader; - class KernelBuilder : public luci::CircleNodeVisitor> { public: - KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader) - : _module_loader(module_loader), _graph_loader(graph_loader) + KernelBuilder( + const std::unordered_map &graph_to_runtime_graph, + const std::unordered_map &node_to_tensor) + : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) { } @@ -45,6 +45,7 @@ public: std::unique_ptr visit(const luci::CircleConcatenation *node) override; std::unique_ptr visit(const luci::CircleConv2D *node) override; std::unique_ptr visit(const luci::CircleConst *node) override; + std::unique_ptr visit(const luci::CircleDepthToSpace *node) override; std::unique_ptr visit(const luci::CircleDepthwiseConv2D *node) override; std::unique_ptr visit(const luci::CircleElu *node) override; std::unique_ptr visit(const luci::CircleFullyConnected *node) override; @@ -61,6 +62,8 @@ public: std::unique_ptr visit(const luci::CircleOutput *node) override; std::unique_ptr visit(const luci::CirclePad *node) override; std::unique_ptr visit(const luci::CircleReshape *node) override; + std::unique_ptr visit(const luci::CircleReverseV2 *node) override; + std::unique_ptr visit(const luci::CircleSlice *node) 
override; std::unique_ptr visit(const luci::CircleSoftmax *node) override; std::unique_ptr visit(const luci::CircleSpaceToDepth *node) override; std::unique_ptr visit(const luci::CircleSplit *node) override; @@ -82,8 +85,8 @@ private: RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const; private: - const ModuleLoader &_module_loader; - const GraphLoader &_graph_loader; + const std::unordered_map &_graph_to_runtime_graph; + const std::unordered_map &_node_to_tensor; }; } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp new file mode 100644 index 0000000..33bc8ec --- /dev/null +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "loader/GraphLoader.h" +#include "loader/KernelBuilder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace luci_interpreter +{ +namespace +{ + +using namespace testing; + +class KernelBuilderTest : public Test +{ +protected: + luci::CircleInput *createInputNode() { return createNode(); } + + template NodeT *createNode(Args &&... args) + { + auto *node = _graph.nodes()->create(std::forward(args)...); + // The actual type does not matter for the purpose of the tests. + // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry + // actual output types). 
+ node->dtype(loco::DataType::FLOAT32); + return node; + } + + template NodeOutT *createNodeOut(loco::Node *node, int index) + { + auto *node_out = createNode(); + node_out->input(node); + node_out->index(index); + return node_out; + } + + template std::unique_ptr buildKernel(const luci::CircleNode *op) + { + std::unordered_map graph_to_runtime_graph; + + RuntimeGraph runtime_graph(nullptr); + RuntimeToIR runtime_to_ir; + GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph, + _node_to_tensor); + graph_loader.loadTensors(); + + KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor); + + auto kernel = op->accept(&kernel_builder); + return std::unique_ptr(dynamic_cast(kernel.release())); + } + + void checkTensor(const Tensor *tensor, const loco::Node *node) + { + EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node))); + } + +private: + loco::Graph _graph; + std::unordered_map _node_to_tensor; +}; + +TEST_F(KernelBuilderTest, Add) +{ + auto *input1 = createInputNode(); + auto *input2 = createInputNode(); + + auto *op = createNode(); + op->x(input1); + op->y(input2); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input1(), input1); + checkTensor(kernel->input2(), input2); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, ArgMax) +{ + auto *input = createInputNode(); + auto *axis = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->dimension(axis); + + op->output_type(loco::DataType::FLOAT32); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->axis(), axis); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().output_type, Eq(op->output_type())); +} + +TEST_F(KernelBuilderTest, AveragePool2D) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->value(input); + + op->padding(luci::Padding::SAME); + op->filter()->h(11); + op->filter()->w(13); + op->stride()->h(17); + op->stride()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); + EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Concatenation) +{ + auto *input1 = createInputNode(); + auto *input2 = createInputNode(); + + auto *op = createNode(2); + op->values(0, input1); + op->values(1, input2); + op->axis(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(0), input1); + checkTensor(kernel->input(1), input2); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().axis, Eq(op->axis())); +} + +TEST_F(KernelBuilderTest, Conv2D) +{ + auto *input = createInputNode(); + auto *filter = createInputNode(); + auto *bias = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->filter(filter); + op->bias(bias); + + 
op->padding(luci::Padding::SAME); + op->stride()->h(11); + op->stride()->w(13); + op->dilation()->h(17); + op->dilation()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->filter(), filter); + checkTensor(kernel->bias(), bias); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); + EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, DepthToSpace) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + op->block_size(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().block_size, Eq(op->block_size())); +} + +TEST_F(KernelBuilderTest, DepthwiseConv2D) +{ + auto *input = createInputNode(); + auto *filter = createInputNode(); + auto *bias = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->filter(filter); + op->bias(bias); + + op->padding(luci::Padding::SAME); + op->depthMultiplier(11); + op->stride()->h(13); + op->stride()->w(17); + op->dilation()->h(19); + op->dilation()->w(23); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->filter(), filter); + checkTensor(kernel->bias(), bias); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); + EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Elu) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->features(input); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, FullyConnected) +{ + auto *input = createInputNode(); + auto *weights = createInputNode(); + auto *bias = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->weights(weights); + op->bias(bias); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->weights(), weights); + checkTensor(kernel->bias(), bias); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, L2Normalize) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->x(input); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + 
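+  // buildKernel() returns nullptr when the visitor produces a kernel of a different concrete
+  // type, so the NotNull() assertion below doubles as a type check.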
ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, L2Pool2D) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->value(input); + + op->padding(luci::Padding::SAME); + op->filter()->h(11); + op->filter()->w(13); + op->stride()->h(17); + op->stride()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); + EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, LeakyRelu) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->features(input); + + op->alpha(11.0f); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); +} + +TEST_F(KernelBuilderTest, LocalResponseNormalization) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + op->radius(11); + op->bias(13.0f); + op->alpha(15.0f); + op->beta(17.0f); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().radius, Eq(op->radius())); + EXPECT_THAT(kernel->params().bias, Eq(op->bias())); + EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); + EXPECT_THAT(kernel->params().beta, Eq(op->beta())); +} + +TEST_F(KernelBuilderTest, Logistic) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->x(input); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, MaxPool2D) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->value(input); + + op->padding(luci::Padding::SAME); + op->filter()->h(11); + op->filter()->w(13); + op->stride()->h(17); + op->stride()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); + EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Mean) +{ + auto *input = createInputNode(); + auto *axes = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->reduction_indices(axes); + + op->keep_dims(true); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->axes(), axes); + 
checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims())); +} + +TEST_F(KernelBuilderTest, Mul) +{ + auto *input1 = createInputNode(); + auto *input2 = createInputNode(); + + auto *op = createNode(); + op->x(input1); + op->y(input2); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input1(), input1); + checkTensor(kernel->input2(), input2); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Pad) +{ + auto *input = createInputNode(); + auto *paddings = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->paddings(paddings); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->paddings(), paddings); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, Reshape) +{ + auto *input = createInputNode(); + auto *shape = createInputNode(); + + auto *op = createNode(); + op->tensor(input); + op->shape(shape); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->shape(), shape); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, ReverseV2) +{ + auto *input = createInputNode(); + auto *axes = createInputNode(); + + auto *op = createNode(); + op->tensor(input); + op->axis(axes); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->axes(), axes); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, Slice) +{ + auto *input = createInputNode(); + auto *begin = createInputNode(); + auto *size = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->begin(begin); + op->size(size); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->begin(), begin); + checkTensor(kernel->size(), size); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, Softmax) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->logits(input); + + op->beta(11.0f); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().beta, Eq(op->beta())); +} + +TEST_F(KernelBuilderTest, SpaceToDepth) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + op->block_size(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().block_size, op->block_size()); +} + +TEST_F(KernelBuilderTest, Split) +{ + auto *axis = createInputNode(); + auto *input = createInputNode(); + auto *op = createNode(); + auto *output1 = createNodeOut(op, 0); + auto *output2 = createNodeOut(op, 1); + + op->split_dim(axis); + op->input(input); + + op->num_split(2); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->axis(), axis); + checkTensor(kernel->input(), input); + checkTensor(kernel->output(0), output1); + checkTensor(kernel->output(1), output2); +} + +TEST_F(KernelBuilderTest, Squeeze) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + 
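+  // The squeeze dimensions set below are forwarded into the kernel's params and compared
+  // element-wise by the ElementsAreArray check at the end of this test.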
op->squeeze_dims({11, 13}); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims())); +} + +TEST_F(KernelBuilderTest, StridedSlice) +{ + auto *input = createInputNode(); + auto *begin = createInputNode(); + auto *end = createInputNode(); + auto *strides = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->begin(begin); + op->end(end); + op->strides(strides); + + op->begin_mask(11); + op->ellipsis_mask(13); + op->end_mask(17); + op->new_axis_mask(19); + op->shrink_axis_mask(23); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->begin(), begin); + checkTensor(kernel->end(), end); + checkTensor(kernel->strides(), strides); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask())); + EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask())); + EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask())); + EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask())); + EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask())); +} + +TEST_F(KernelBuilderTest, Transpose) +{ + auto *input = createInputNode(); + auto *perm = createInputNode(); + + auto *op = createNode(); + op->a(input); + op->perm(perm); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->perm(), perm); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, TransposeConv) +{ + auto *output_shape = createInputNode(); + auto *filter = createInputNode(); + auto *input = createInputNode(); + + auto *op = createNode(); + op->inputSizes(output_shape); + op->filter(filter); + op->outBackprop(input); + + op->padding(luci::Padding::SAME); + op->stride()->h(11); + op->stride()->w(13); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->output_shape(), output_shape); + checkTensor(kernel->filter(), filter); + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); +} + +TEST_F(KernelBuilderTest, Unpack) +{ + auto *input = createInputNode(); + auto *op = createNode(); + auto *output1 = createNodeOut(op, 0); + auto *output2 = createNodeOut(op, 1); + + op->value(input); + + op->num(2); + op->axis(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(0), output1); + checkTensor(kernel->output(1), output2); + EXPECT_THAT(kernel->params().axis, Eq(op->axis())); +} + +TEST_F(KernelBuilderTest, NonExisting1_NEG) +{ + auto *op = createNode(); + ASSERT_ANY_THROW(buildKernel(op)); +} + +TEST_F(KernelBuilderTest, NonExisting2_NEG) +{ + auto *op = createNode(); + ASSERT_ANY_THROW(buildKernel(op)); +} + +TEST_F(KernelBuilderTest, NonExisting3_NEG) +{ + auto *op = createNode(); + ASSERT_ANY_THROW(buildKernel(op)); +} + +} // namespace +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp index 7780a61..b9a2ae0 100644 --- 
a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp +++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp @@ -41,8 +41,11 @@ void ModuleLoader::load() { const loco::Graph *graph = _module->graph(i); RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); - GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor); - loader.load(); + GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph, + _node_to_tensor); + loader.loadTensors(); + loader.initInputOutputTensors(); + loader.loadOperators(); } } diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h index 954dbfb..1af0ed7 100644 --- a/compiler/luci-interpreter/src/loader/ModuleLoader.h +++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h @@ -36,11 +36,6 @@ public: void load(); - RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const - { - return _graph_to_runtime_graph.at(graph); - } - private: const luci::Module *_module; RuntimeModule *_runtime_module; diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh index dfd55a6..12c9a45 100755 --- a/compiler/luci-value-test/evalverify.sh +++ b/compiler/luci-value-test/evalverify.sh @@ -4,8 +4,10 @@ # # HOW TO USE # -# ./evalverify.sh ... -# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test) +# ./evalverify.sh ... +# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test) +# work_dir : artifacts directoy where test materials exist +# venv_dir : python virtual environment home directory VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py" diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst index 6a332f9..364d881 100644 --- a/compiler/luci-value-test/test.lst +++ b/compiler/luci-value-test/test.lst @@ -1,6 +1,8 @@ #addeval(Abs_000) addeval(Add_000) +#addeval(Add_001) addeval(Add_U8_000) +#addeval(AddN_000) #addeval(ArgMax_000) #addeval(ArgMax_001) #addeval(ArgMax_002) @@ -9,73 +11,173 @@ addeval(Add_U8_000) #addeval(ArgMax_U8_001) #addeval(ArgMax_U8_002) #addeval(ArgMax_U8_003) +#addeval(ArgMin_000) +#addeval(ArgMin_001) +#addeval(ArgMin_002) +#addeval(ArgMin_003) +#addeval(ArgMin_U8_000) +#addeval(ArgMin_U8_001) +#addeval(ArgMin_U8_002) +#addeval(ArgMin_U8_003) addeval(AveragePool2D_000) +#addeval(BatchMatMul_000) #addeval(BatchMatMulV2_000) #addeval(BatchMatMulV2_001) #addeval(BatchToSpaceND_000) #addeval(Cast_000) +#addeval(Cast_001) +#addeval(Ceil_000) addeval(Concatenation_000) addeval(Concatenation_U8_000) addeval(Conv2D_000) addeval(Conv2D_001) addeval(Conv2D_002) +#addeval(Conv2D_003) addeval(Conv2D_U8_000) addeval(Conv2D_U8_001) #addeval(Cos_000) +#addeval(DepthToSpace_000) addeval(DepthwiseConv2D_000) addeval(DepthwiseConv2D_U8_000) +#addeval(DepthwiseConv2D_U8_001) +addeval(DepthwiseConv2D_001) #addeval(Div_000) +#addeval(ELU_000) #addeval(Equal_000) #addeval(Exp_000) +#addeval(ExpandDims_000) +#addeval(ExpandDims_001) +#addeval(ExpandDims_002) +#addeval(ExpandDims_003) +#addeval(Fill_000) +#addeval(Fill_001) +#addeval(Floor_000) +#addeval(FloorDiv_000) +#addeval(FloorDiv_001) +#addeval(FloorMod_000) +#addeval(FloorMod_001) addeval(FullyConnected_000) addeval(FullyConnected_001) #addeval(FullyConnected_002) #addeval(FullyConnected_U8_000) #addeval(Gather_000) +#addeval(GatherNd_000) +#addeval(Greater_000) +#addeval(GreaterEqual_000) 
#addeval(If_000) #addeval(If_001) +addeval(L2Normalize_000) +addeval(L2Pool2D_000) +#addeval(L2Pool2D_U8_000) +#addeval(LeakyRelu_000) +#addeval(Less_000) +#addeval(LessEqual_000) +#addeval(LocalResponseNormalization_000) +#addeval(Log_000) +#addeval(LogicalAnd_000) #addeval(LogicalNot_000) #addeval(LogicalOr_000) -#addeval(Logistic_000) +addeval(Logistic_000) +#addeval(LogSoftmax_000) +#addeval(MatMul_000) +#addeval(MatrixDiag_000) +#addeval(MatrixSetDiag_000) +#addeval(Maximum_000) addeval(MaxPool2D_000) addeval(MaxPool2D_U8_000) addeval(Mean_000) addeval(Mean_001) addeval(Mean_U8_000) +#addeval(Minimum_000) +#addeval(MirrorPad_000) addeval(Mul_000) #addeval(Mul_U8_000) +#addeval(Neg_000) +#addeval(NotEqual_000) +#addeval(OneHot_000) +#addeval(OneHot_001) +#addeval(OneHot_002) +#addeval(OneHot_003) #addeval(Pack_000) #addeval(Pack_U8_000) addeval(Pad_000) addeval(Pad_U8_000) +#addeval(Pow_000) +#addeval(PRelu_000) +#addeval(Range_000) +#addeval(Rank_000) +#addeval(ReduceAny_000) +#addeval(ReduceAny_001) +#addeval(ReduceAny_002) +#addeval(ReduceAny_003) +#addeval(ReduceMax_000) +#addeval(ReduceMin_000) #addeval(ReduceProd_000) #addeval(ReduceProd_001) #addeval(ReduceProd_002) #addeval(ReduceProd_003) #addeval(ReLU_000) +#addeval(ReLU6_000) +#addeval(ReLUN1To1_000) addeval(Reshape_000) addeval(Reshape_001) addeval(Reshape_002) #addeval(Reshape_003) addeval(Reshape_U8_000) +#addeval(ResizeBilinear_000) +#addeval(ResizeNearestNeighbor_000) +#addeval(ReverseSequence_000) +#addeval(ReverseV2_000) +#addeval(Round_000) #addeval(Rsqrt_000) +#addeval(ScatterNd_000) +#addeval(SegmentSum_000) +#addeval(Select_000) +#addeval(Select_001) +#addeval(Select_002) +#addeval(SelectV2_000) +#addeval(SelectV2_001) +#addeval(SelectV2_002) +#addeval(Shape_000) #addeval(Sin_000) +addeval(Slice_000) addeval(Softmax_000) #addeval(Softmax_U8_000) #addeval(SpaceToBatchND_000) #addeval(SpaceToBatchND_001) #addeval(SpaceToBatchND_002) #addeval(SpaceToBatchND_003) -#addeval(StridedSlice_000) -#addeval(StridedSlice_001) +#addeval(SpaceToDepth_000) +#addeval(SparseToDense_000) +#addeval(Split_000) +#addeval(SplitV_000) +#addeval(Sqrt_000) +#addeval(Square_000) +#addeval(SquaredDifference_000) +addeval(Squeeze_000) +addeval(StridedSlice_000) +addeval(StridedSlice_001) +addeval(StridedSlice_002) #addeval(Sub_000) #addeval(Sub_U8_000) +#addeval(Sum_000) +#addeval(Sum_001) #addeval(Tanh_000) #addeval(Tile_000) #addeval(Tile_U8_000) -#addeval(Transpose_000) +#addeval(TopKV2_000) +#addeval(TopKV2_001) +addeval(Transpose_000) +#addeval(TransposeConv_000) #addeval(Unpack_000) #addeval(Unpack_001) #addeval(Unpack_002) +addeval(Unpack_003) +#addeval(Where_000) +#addeval(Where_001) #addeval(While_000) #addeval(While_001) +#addeval(While_002) +#addeval(While_003) +#addeval(YUV_TO_RGB_U8_000) +#addeval(ZerosLike_000) diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp index 3c01b67..344c99f 100644 --- a/compiler/luci/export/src/CircleOperationExporter.cpp +++ b/compiler/luci/export/src/CircleOperationExporter.cpp @@ -890,7 +890,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node) { export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH, circle::BuiltinOptions_SpaceToDepthOptions, - CreateSpaceToDepthOptions(builder).Union()); + CreateSpaceToDepthOptions(builder, node->block_size()).Union()); } void OperationExporter::visit(luci::CircleSparseToDense *node) diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp 
b/compiler/luci/export/src/CircleTensorExporter.cpp index 5cad392..dc8c2fb 100644 --- a/compiler/luci/export/src/CircleTensorExporter.cpp +++ b/compiler/luci/export/src/CircleTensorExporter.cpp @@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam scale = builder.CreateVector(quantparam->scale); zero_point = builder.CreateVector(quantparam->zerop); } - return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point); + // Note: QuantizationDetails is not supported + return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point, + circle::QuantizationDetails::QuantizationDetails_NONE, + 0, quantparam->quantized_dimension); } void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder, diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp index 81e945d..bc7f397 100644 --- a/compiler/luci/import/src/CircleReader.cpp +++ b/compiler/luci/import/src/CircleReader.cpp @@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization) const auto &max = quantization->max; const auto &scale = quantization->scale; const auto &zero_point = quantization->zero_point; + const auto &quantized_dimension = quantization->quantized_dimension; if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty())) { @@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization) quantparam->max = max; quantparam->scale = scale; quantparam->zerop = zero_point; + quantparam->quantized_dimension = quantized_dimension; return quantparam; } diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp index 4426e15..8366546 100644 --- a/compiler/luci/import/src/Importer.test.cpp +++ b/compiler/luci/import/src/Importer.test.cpp @@ -20,4 +20,9 @@ #include -TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; } +TEST(TensorFlowLiteImport, Dummy) +{ + luci::Importer import; + + SUCCEED(); +} diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp index 85e7e55..c77c55e 100644 --- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp +++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp @@ -32,21 +32,7 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const if (outputs.size() != 1) return false; - // Must be one of the following types - // float16, float32, float64, complex64, or complex128 const auto &tensors = args.reader.tensors(); - const auto &tensor = tensors.at(inputs[0]); - switch (tensor->type) - { - case circle::TensorType_FLOAT16: - case circle::TensorType_FLOAT32: - case circle::TensorType_FLOAT64: - case circle::TensorType_COMPLEX64: - break; - default: - return false; - } - if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type) return false; diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp index 7bdf46d..eb0956c 100644 --- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp +++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp @@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const if (args.op.inputs.size() != 3) return false; + const auto &inputs = args.op.inputs; + const auto &tensors = args.reader.tensors(); + const auto &filter_tensor = tensors.at(inputs[1]); + const auto &filter_shape = filter_tensor.get()->shape; + const auto &ifm_tensor = 
tensors.at(inputs[2]); + const auto &ifm_shape = ifm_tensor.get()->shape; + + // ifm and filters must be 4-D tensor + if (ifm_shape.size() != 4) + return false; + if (filter_shape.size() != 4) + return false; + + // input shape : [batch, height, width, in_channels] + // filters shape : [output_channels, height, weight, in_channels] + if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3)) + return false; + return true; } diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst index 488dcfb..acd7921 100644 --- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst +++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst @@ -120,6 +120,7 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected) CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather) CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm) // Virtual node(s) +CIRCLE_NODE(CIRCLECONST, void) CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput) CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput) CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy) diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h index 7253e65..6944373 100644 --- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h +++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h @@ -29,6 +29,7 @@ struct CircleQuantParam std::vector max; std::vector scale; std::vector zerop; + int32_t quantized_dimension{0}; }; } // namespace luci diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp index 26bf073..a5973e5 100644 --- a/compiler/luci/lang/src/Module.test.cpp +++ b/compiler/luci/lang/src/Module.test.cpp @@ -22,7 +22,7 @@ TEST(ModuleTest, consturctor) { auto gs = luci::make_module(); - GTEST_SUCCEED(); + SUCCEED(); } TEST(ModuleTest, add) diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp index 74ea82c..c07268c 100644 --- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp +++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp @@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor) ASSERT_EQ(0, custom_node.custom_code().size()); } -TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); } +TEST(CircleCustomTest, constructor_NEG) +{ + ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); + + SUCCEED(); +} TEST(CircleCustomTest, invalidIndex_NEG) { diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp index e3c8c9f..35f28e9 100644 --- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp +++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp @@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor) TEST(CircleIfTestDeath, invalid_arity_NEG) { ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), ""); + + SUCCEED(); } TEST(CircleIfTestDeath, invalid_output_count_NEG) { ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), ""); + + SUCCEED(); } TEST(CircleIfTestDeath, invalid_input_get_index_NEG) diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp index 19290c0..913686f 100644 --- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp +++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp @@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor) TEST(CircleWhileTestDeath, invalid_arity_NEG) { ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), ""); + + SUCCEED(); } TEST(CircleWhileTestDeath, 
invalid_output_count_NEG) { ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), ""); + + SUCCEED(); } TEST(CircleWhileTestDeath, invalid_input_get_index_NEG) diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index 90fbe90..2edf7a9 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const { static const std::vector fakeq_supported_input_dtype{"float32"}; static const std::vector fakeq_supported_output_dtype{"uint8"}; - static const std::vector fakeq_supported_granularity{"layer"}; + static const std::vector fakeq_supported_granularity{"layer", "channel"}; auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); @@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const { static const std::vector qwmm_supported_input_dtype{"float32"}; static const std::vector qwmm_supported_output_dtype{"uint8"}; - static const std::vector qwmm_supported_granularity{"layer"}; + static const std::vector qwmm_supported_granularity{"layer", "channel"}; auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp index b81db88..edbaefa 100644 --- a/compiler/luci/pass/src/FuseBCQPass.cpp +++ b/compiler/luci/pass/src/FuseBCQPass.cpp @@ -67,14 +67,190 @@ const std::string node_name_prefix(luci::NodeName node_name) return prefix; } +/** + * @brief Create CircleOutputExclude operation, which has same shape and dtype with + * original circle_node. + */ +luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node) +{ + auto graph = circle_node->graph(); + auto noOp = graph->nodes()->create(); + + if (circle_node->shape_status() == luci::ShapeStatus::VALID) + { + noOp->dtype(circle_node->dtype()); + noOp->rank(circle_node->rank()); + for (uint32_t i = 0; i < circle_node->rank(); ++i) + noOp->dim(i) = circle_node->dim(i); + } + else + { + // For type inference + noOp->dtype(loco::DataType::FLOAT32); + } + + return noOp; +}; + } // namespace namespace { -class BCQConverter final +// V means the version of BCQ. 
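+// Dispatch is done with one explicit specialization per metadata version, roughly (sketch;
+// only version 1 exists in this pass, and a BCQFuser<2> is shown purely as a hypothetical):
+//
+//   template <int V> class BCQFuser;          // V = BCQ metadata version
+//   template <> class BCQFuser<1> { public: bool fuseBCQ(loco::Graph *g); /* ... */ };
+//   template <> class BCQFuser<2> { /* would handle a newer BCQ metadata layout */ };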
+template class BCQFuser; + +template <> class BCQFuser<1> { public: + bool fuseBCQ(loco::Graph *g) + { + bool changed = false; + + for (auto node : loco::all_nodes(g)) + { + if (auto circle_const = dynamic_cast(node)) + { + add_BCQ_info_node(circle_const); + } + } + + if (!is_bcqinfo_valid()) + return false; + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + if (auto gather = dynamic_cast(node)) + { + auto params = dynamic_cast(gather->params()); + if (params != nullptr && has_BCQ_info(params)) + { + auto bcq_gather = g->nodes()->create(); + + bcq_gather->op_version(1); + bcq_gather->input_scales(get_alpha(params)); + bcq_gather->input_binary(get_packed_binary_code(params)); + bcq_gather->indices(gather->indices()); + bcq_gather->input_clusters(packed_clusters(params)); + + // input_binary shape : [output_size, hidden_size] + const auto binary_hidden_size = + loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; + bcq_gather->input_hidden_size(binary_hidden_size); + + if (do_w_x(params)) + { + bcq_gather->axis(gather->axis()); + } + else + { + const auto axis_transpose = (gather->axis() == 0) ? 1 : 0; + bcq_gather->axis(axis_transpose); + } + + loco::replace(gather).with(bcq_gather); + + changed = true; + } + } + else if (auto fully_connected = dynamic_cast(node)) + { + auto weights = dynamic_cast(fully_connected->weights()); + if (weights != nullptr && has_BCQ_info(weights)) + { + auto bcq_fc = g->nodes()->create(); + + bcq_fc->op_version(1); + bcq_fc->weights_scales(get_alpha(weights)); + bcq_fc->weights_binary(get_packed_binary_code(weights)); + bcq_fc->bias(fully_connected->bias()); + bcq_fc->weights_clusters(packed_clusters(weights)); + bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); + + loco::Node *bcq_input = fully_connected->input(); + int32_t batch_rank = 0; + + // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 + const auto original_input = loco::must_cast(fully_connected->input()); + if (original_input->shape_status() == luci::ShapeStatus::VALID && + original_input->rank() > 2) + { + auto new_shape = g->nodes()->create(); + new_shape->dtype(loco::DataType::S32); + new_shape->size(2); + new_shape->rank(1); + new_shape->dim(0) = 2; + + auto batch_size = 1; + for (uint32_t i = 0; i < original_input->rank() - 1; ++i) + batch_size *= original_input->dim(i).value(); + + new_shape->at(0) = batch_size; + new_shape->at(1) = + original_input->dim(original_input->rank() - 1).value(); + new_shape->shape_status(luci::ShapeStatus::VALID); + + auto reshape = g->nodes()->create(); + reshape->tensor(original_input); + reshape->shape(new_shape); + + bcq_input = reshape; + batch_rank = original_input->rank() - 2; + } + + // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected + if (do_w_x(weights)) + { + const auto binary_hidden_size = + loco::must_cast(fully_connected->input()) + ->dim(batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + bcq_fc->input(bcq_input); + loco::replace(fully_connected).with(bcq_fc); + } + else + { + const auto binary_hidden_size = + loco::must_cast(fully_connected->input()) + ->dim(1 + batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + + auto perm = g->nodes()->create(); + perm->dtype(loco::DataType::S32); + perm->size(2); + perm->rank(1); + perm->dim(0) = 2; + perm->at(0) = 1; + perm->at(1) = 0; + perm->shape_status(luci::ShapeStatus::VALID); + + auto input_transpose = g->nodes()->create(); + 
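+          // perm = {1, 0}: transpose the rank-2 input in front of BCQFullyConnected and
+          // transpose its output back with the same permutation (see output_transpose below).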
input_transpose->a(bcq_input); + input_transpose->perm(perm); + + bcq_fc->input(input_transpose); + + auto output_transpose = g->nodes()->create(); + output_transpose->a(bcq_fc); + output_transpose->perm(perm); + + loco::replace(fully_connected).with(output_transpose); + } + + changed = true; + } + } + } + + if (changed) + clear_BCQ_nodes(); + + return changed; + } + +private: void add_BCQ_info_node(luci::CircleConst *node) { const auto node_name = node->name(); @@ -119,16 +295,65 @@ public: return has_info; } + /** + * @brief Exclude BCQ information nodes which are used for fusing BCQ operations + * from graph output by using CircleOutputExclude + */ + void clear_BCQ_nodes() + { + auto clear_nodes = [](std::map &nodes) { + for (auto &n : nodes) + { + auto node = n.second; + + for (auto s : loco::succs(node)) + { + if (auto outnode = dynamic_cast(s)) + { + outnode->from(createNoOp(node)); + } + else if (auto reshape_node = dynamic_cast(s)) + { + for (auto o : loco::succs(reshape_node)) + { + auto circle_output = loco::must_cast(o); + circle_output->from(createNoOp(reshape_node)); + } + } + } + } + }; + + clear_nodes(_do_w_x); + clear_nodes(_alpha); + clear_nodes(_packed_binary_code); + clear_nodes(_number_of_clusters); + clear_nodes(_size_of_clusters); + clear_nodes(_qbits_of_clusters); + clear_nodes(_dequant_weight); + } + + bool is_bcqinfo_valid() + { + // do_w_x should be int32 or bool type + for (auto n : _do_w_x) + { + if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32) + return false; + } + + return true; + } + +private: bool do_w_x(luci::CircleConst *node) { const auto prefix = node_name_prefix(node->name()); if (_do_w_x[prefix]->dtype() == loco::DataType::S32) return _do_w_x[prefix]->at(0) == 1; - else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL) - return _do_w_x[prefix]->at(0); else - throw std::runtime_error("do_w_x should be int or bool"); + return _do_w_x[prefix]->at(0); } luci::CircleConst *get_alpha(luci::CircleConst *node) @@ -187,64 +412,6 @@ public: return packed_clusters; } - /** - * @brief Exclude BCQ information nodes which are used for fusing BCQ operations - * from graph output by using CircleOutputExclude - */ - void clear_BCQ_nodes() - { - auto createNoOp = [](luci::CircleNode *circle_node) { - auto graph = circle_node->graph(); - auto noOp = graph->nodes()->create(); - - if (circle_node->shape_status() == luci::ShapeStatus::VALID) - { - noOp->dtype(circle_node->dtype()); - noOp->rank(circle_node->rank()); - for (uint32_t i = 0; i < circle_node->rank(); ++i) - noOp->dim(i) = circle_node->dim(i); - } - else - { - // For type inference - noOp->dtype(loco::DataType::FLOAT32); - } - - return noOp; - }; - - auto clear_nodes = [createNoOp](std::map &nodes) { - for (auto &n : nodes) - { - auto node = n.second; - - for (auto s : loco::succs(node)) - { - if (auto outnode = dynamic_cast(s)) - { - outnode->from(createNoOp(node)); - } - else if (auto reshape_node = dynamic_cast(s)) - { - for (auto o : loco::succs(reshape_node)) - { - auto circle_output = loco::must_cast(o); - circle_output->from(createNoOp(reshape_node)); - } - } - } - } - }; - - clear_nodes(_do_w_x); - clear_nodes(_alpha); - clear_nodes(_packed_binary_code); - clear_nodes(_number_of_clusters); - clear_nodes(_size_of_clusters); - clear_nodes(_qbits_of_clusters); - clear_nodes(_dequant_weight); - } - private: std::map _do_w_x; std::map _alpha; @@ -262,142 +429,9 @@ namespace luci bool FuseBCQPass::run(loco::Graph *g) { - BCQConverter converter; - bool changed = 
false; - for (auto node : loco::all_nodes(g)) - { - if (auto circle_const = dynamic_cast(node)) - { - converter.add_BCQ_info_node(circle_const); - } - } - - for (auto node : loco::active_nodes(loco::output_nodes(g))) - { - if (auto gather = dynamic_cast(node)) - { - auto params = dynamic_cast(gather->params()); - if (params != nullptr && converter.has_BCQ_info(params)) - { - auto bcq_gather = g->nodes()->create(); - - bcq_gather->input_scales(converter.get_alpha(params)); - bcq_gather->input_binary(converter.get_packed_binary_code(params)); - bcq_gather->indices(gather->indices()); - bcq_gather->input_clusters(converter.packed_clusters(params)); - - const auto binary_hidden_size = - loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; - bcq_gather->input_hidden_size(binary_hidden_size); - - if (converter.do_w_x(params)) - { - bcq_gather->axis(gather->axis()); - } - else - { - const auto axis_transpose = (gather->axis() == 0) ? 1 : 0; - bcq_gather->axis(axis_transpose); - } - - loco::replace(gather).with(bcq_gather); - - changed = true; - } - } - else if (auto fully_connected = dynamic_cast(node)) - { - auto weights = dynamic_cast(fully_connected->weights()); - if (weights != nullptr && converter.has_BCQ_info(weights)) - { - auto bcq_fc = g->nodes()->create(); - - bcq_fc->weights_scales(converter.get_alpha(weights)); - bcq_fc->weights_binary(converter.get_packed_binary_code(weights)); - bcq_fc->bias(fully_connected->bias()); - bcq_fc->weights_clusters(converter.packed_clusters(weights)); - bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); - - loco::Node *bcq_input = fully_connected->input(); - int32_t batch_rank = 0; - - // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 - const auto original_input = loco::must_cast(fully_connected->input()); - if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2) - { - auto new_shape = g->nodes()->create(); - new_shape->dtype(loco::DataType::S32); - new_shape->size(2); - new_shape->rank(1); - new_shape->dim(0) = 2; - - auto batch_size = 1; - for (uint32_t i = 0; i < original_input->rank() - 1; ++i) - batch_size *= original_input->dim(i).value(); - - new_shape->at(0) = batch_size; - new_shape->at(1) = - original_input->dim(original_input->rank() - 1).value(); - new_shape->shape_status(ShapeStatus::VALID); - - auto reshape = g->nodes()->create(); - reshape->tensor(original_input); - reshape->shape(new_shape); - - bcq_input = reshape; - batch_rank = original_input->rank() - 2; - } - - // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected - if (converter.do_w_x(weights)) - { - const auto binary_hidden_size = - loco::must_cast(fully_connected->input()) - ->dim(batch_rank) - .value(); - bcq_fc->weights_hidden_size(binary_hidden_size); - bcq_fc->input(bcq_input); - loco::replace(fully_connected).with(bcq_fc); - } - else - { - const auto binary_hidden_size = - loco::must_cast(fully_connected->input()) - ->dim(1 + batch_rank) - .value(); - bcq_fc->weights_hidden_size(binary_hidden_size); - - auto perm = g->nodes()->create(); - perm->dtype(loco::DataType::S32); - perm->size(2); - perm->rank(1); - perm->dim(0) = 2; - perm->at(0) = 1; - perm->at(1) = 0; - perm->shape_status(ShapeStatus::VALID); - - auto input_transpose = g->nodes()->create(); - input_transpose->a(bcq_input); - input_transpose->perm(perm); - - bcq_fc->input(input_transpose); - - auto output_transpose = g->nodes()->create(); - output_transpose->a(bcq_fc); - 
output_transpose->perm(perm); - - loco::replace(fully_connected).with(output_transpose); - } - - changed = true; - } - } - } - - if (changed) - converter.clear_BCQ_nodes(); + changed = BCQFuser<1>().fuseBCQ(g); return changed; } diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp index 6726ce7..9c9e741 100644 --- a/compiler/luci/pass/src/QuantizationUtils.cpp +++ b/compiler/luci/pass/src/QuantizationUtils.cpp @@ -99,6 +99,13 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t nudged_zero_point = static_cast(std::round(zero_point_double)); } + // protect scale from being very low due to overflow + if (scale < 1e-5) + { + scale = 1e-5; + nudged_zero_point = static_cast(std::round(qmin_double - rmin / scale)); + } + nudged_min = static_cast((qmin_double - nudged_zero_point) * scale); nudged_max = static_cast((qmax_double - nudged_zero_point) * scale); diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp index f8abee7..2264bd7 100644 --- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp @@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node) node->dtype() == loco::DataType::S32; // bias } -void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor) +void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor, + int32_t &channel_dim_index) { assert(node->dtype() == loco::DataType::FLOAT32); @@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto uint32_t indices[4] = { 0, }; - int channel_dim_index{0}; if (!get_channel_dim_index(node, dimension, channel_dim_index)) { @@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto } void asym_wquant_per_channel(CircleConst *node, std::vector &min, - std::vector &scaling_factor) + std::vector &scaling_factor, int32_t &channel_dim_index) { assert(node->dtype() == loco::DataType::FLOAT32); @@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector &min, uint32_t indices[4] = { 0, }; - int channel_dim_index{0}; if (!get_channel_dim_index(node, dimension, channel_dim_index)) { @@ -350,8 +349,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor circle_node->dtype(loco::DataType::S16); } - circle_node->quantparam()->max[0] = nudged_max; - circle_node->quantparam()->min[0] = nudged_min; + circle_node->quantparam()->min.clear(); + circle_node->quantparam()->max.clear(); circle_node->quantparam()->scale.push_back(scaling_factor); circle_node->quantparam()->zerop.push_back(zp); } @@ -472,15 +471,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor assert(quantparam != nullptr); auto min = quantparam->min; auto scaling_factor = quantparam->scale; + int32_t channel_dim_index = 0; if (output_type == loco::DataType::U8) { - asym_wquant_per_channel(circle_const, min, scaling_factor); + asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index); } else { - sym_wquant_per_channel(circle_const, scaling_factor); + sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index); } + quantparam->min.clear(); + quantparam->max.clear(); + quantparam->quantized_dimension = channel_dim_index; } // Find min/max per layer-wise else @@ -493,6 +496,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor auto min = quantparam->min[0]; auto scaling_factor = 
quantparam->scale[0]; asym_wquant_per_layer(circle_const, min, scaling_factor); + quantparam->min.clear(); + quantparam->max.clear(); } } } diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst index 188e298..3da3437 100644 --- a/compiler/luci/tests/test.lst +++ b/compiler/luci/tests/test.lst @@ -30,13 +30,16 @@ addread(Ceil_000) addread(Concatenation_000) addread(Concatenation_U8_000) addread(Conv2D_000) +addread(Conv2D_001) addread(Conv2D_002) addread(Conv2D_003) addread(Conv2D_U8_000) +addread(Conv2D_U8_001) addread(Cos_000) addread(DepthToSpace_000) addread(DepthwiseConv2D_000) addread(DepthwiseConv2D_U8_000) +addread(DepthwiseConv2D_U8_001) addread(DepthwiseConv2D_001) addread(Div_000) addread(ELU_000) @@ -84,6 +87,7 @@ addread(MaxPool2D_000) addread(MaxPool2D_U8_000) addread(Mean_000) addread(Mean_001) +addread(Mean_U8_000) addread(Minimum_000) addread(MirrorPad_000) addread(Mul_000) @@ -97,6 +101,7 @@ addread(OneHot_003) addread(Pack_000) addread(Pack_U8_000) addread(Pad_000) +addread(Pad_U8_000) addread(Pow_000) addread(PRelu_000) addread(Range_000) @@ -222,13 +227,16 @@ addwrite(Ceil_000) addwrite(Concatenation_000) addwrite(Concatenation_U8_000) addwrite(Conv2D_000) +addwrite(Conv2D_001) addwrite(Conv2D_002) addwrite(Conv2D_003) addwrite(Conv2D_U8_000) +addwrite(Conv2D_U8_001) addwrite(Cos_000) addwrite(DepthToSpace_000) addwrite(DepthwiseConv2D_000) addwrite(DepthwiseConv2D_U8_000) +addwrite(DepthwiseConv2D_U8_001) addwrite(DepthwiseConv2D_001) addwrite(Div_000) addwrite(ELU_000) @@ -276,6 +284,7 @@ addwrite(MaxPool2D_000) addwrite(MaxPool2D_U8_000) addwrite(Mean_000) addwrite(Mean_001) +addwrite(Mean_U8_000) addwrite(Minimum_000) addwrite(MirrorPad_000) addwrite(Mul_000) diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen index 2c80664..820b6d8 100644 --- a/compiler/one-cmds/one-codegen +++ b/compiler/one-cmds/one-codegen @@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" function Usage() { - echo "Usage: $0 [BACKEND] ..." + echo "Usage: one-codegen [BACKEND] ..." echo "Available BACKEND drivers:" backend_exist=0 for file in `find $DRIVER_PATH -name *-compile -type f`; @@ -33,23 +33,34 @@ function Usage() if [ $backend_exist == 0 ]; then echo " (There is no available backend drivers)" fi + + exit 255 } -# Get command from command-line -BACKEND=$1; shift -BACKEND_DRIVER="$BACKEND-compile" +function version() +{ + $DRIVER_PATH/one-version one-codegen + exit 255 +} -if [[ -z "${BACKEND_DRIVER}" ]]; then +# Get command from command-line +BACKEND=$1 +if [[ -z ${BACKEND} ]]; then Usage - exit 255 fi +shift + +if [[ "${BACKEND}" == "--version" ]]; then + version +fi + +BACKEND_DRIVER="${BACKEND}-compile" BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}" if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then echo "ERROR: '${BACKEND_DRIVER}' is not supported" Usage - exit 255 fi "${BACKEND_DRIVER_CMD}" "$@" diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import index dbf4af5..b1dd8f4 100644 --- a/compiler/one-cmds/one-import +++ b/compiler/one-cmds/one-import @@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" function Usage() { - echo "Usage: $0 [FRAMEWORK] ..." + echo "Usage: one-import [FRAMEWORK] ..." 
echo "Available FRAMEWORK drivers:" framework_exist=0 for file in "$DRIVER_PATH"/one-import-*; @@ -31,23 +31,34 @@ function Usage() if [ $framework_exist == 0 ]; then echo " (There is no available import drivers)" fi + + exit 255 } -# Get command from command-line -FRAMEWORK=$1; shift -FRAMEWORK_DRIVER="one-import-$FRAMEWORK" +function version() +{ + $DRIVER_PATH/one-version one-import-tf + exit 255 +} -if [[ -z "${FRAMEWORK_DRIVER}" ]]; then +# Get command from command-line +FRAMEWORK=$1 +if [[ -z ${FRAMEWORK} ]]; then Usage - exit 255 +fi +shift + +if [ ${FRAMEWORK} = "--version" ]; then + version fi +FRAMEWORK_DRIVER="one-import-$FRAMEWORK" + FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}" if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported" Usage - exit 255 fi "${FRAMEWORK_DRIVER_CMD}" "$@" diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf index c048a4e..d59e1c5 100644 --- a/compiler/one-cmds/one-import-tf +++ b/compiler/one-cmds/one-import-tf @@ -22,14 +22,24 @@ usage() { echo "Convert TensorFlow model to circle." echo "Usage: one-import-tf" + echo " --version Show version information and exit" echo " --input_path " echo " --output_path " echo " --input_arrays " echo " --input_shapes " echo " --output_arrays " - exit 0 + echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)" + exit 255 } +version() +{ + $DRIVER_PATH/one-version one-import-tf + exit 255 +} + +TF_INTERFACE="--v1" + # Parse command-line arguments # while [ "$#" -ne 0 ]; do @@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--input_path') export INPUT_PATH="$2" shift 2 @@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do export OUTPUT_ARRAYS="$2" shift 2 ;; + '--v2') + TF_INTERFACE="--v2" + shift + ;; *) echo "Unknown parameter: ${CUR}" shift @@ -92,14 +109,21 @@ fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # generate temporary tflite file -echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ +echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ --output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log" echo " " >> "${OUTPUT_PATH}.log" -python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ +python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ --output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1 diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite index 31ed5af..053489c 100644 --- a/compiler/one-cmds/one-import-tflite +++ b/compiler/one-cmds/one-import-tflite @@ -22,9 +22,16 @@ usage() { echo "Convert TensorFlow lite model to circle." echo "Usage: one-import-tflite" + echo " --version Show version information and exit" echo " --input_path " echo " --output_path " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-import-tflite + exit 255 } # Parse command-line arguments @@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--input_path') export INPUT_PATH="$2" shift 2 @@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! 
-e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # convert .tflite to .circle echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log" diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize index 95384c1..17b6b98 100644 --- a/compiler/one-cmds/one-optimize +++ b/compiler/one-cmds/one-optimize @@ -22,6 +22,7 @@ usage() { echo "Optimize circle model." echo "Usage: one-optimize" + echo " --version Show version information and exit" echo " --all Enable all optimization algorithms" echo " --fuse_bcq Enable FuseBCQ Pass" echo " --fuse_instnorm Enable FuseInstanceNormalization Pass" @@ -33,7 +34,13 @@ usage() echo " Enable ResolveCustomOpMatMulPass Pass" echo " --input_path " echo " --output_path " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-optimize + exit 255 } OPTIMIZE_all=0 @@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--all') OPTIMIZE_all=1 shift @@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi OPTIMIZE_OPTIONS="" @@ -123,6 +132,13 @@ fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # NOTE do not wrap ${OPTIMIZE_OPTIONS} with "" # optimize circle echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \ diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack index 2bc4c60..9224b2c 100644 --- a/compiler/one-cmds/one-pack +++ b/compiler/one-cmds/one-pack @@ -22,9 +22,16 @@ usage() { echo "Package circle to nnpkg" echo "Usage: one-pack" + echo " -v, --version Show version information and exit" echo " -i " echo " -o " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-pack + exit 255 } # Parse command-line arguments @@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '-v') + version + ;; + '--version') + version + ;; '-i') export INPUT_PATH="$2" shift 2 @@ -55,12 +68,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # Package circle model file to nnpkg echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log" diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize index ff9e266..c74b2c2 100644 --- a/compiler/one-cmds/one-quantize +++ b/compiler/one-cmds/one-quantize @@ -22,16 +22,23 @@ usage() { echo "Quantize circle model." 
echo "Usage: one-quantize" + echo " --version Show version information and exit" echo " --input_dtype Input data type (supported: float32, default=float32)" echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)" - echo " --granularity Quantize granularity (supported: layer, default=layer)" + echo " --granularity Quantize granularity (supported: layer, channel, default=layer)" echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)" echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)" echo " --mode Record mode (supported: percentile/moving_average, default=percentile)" echo " --input_path " echo " --input_data " echo " --output_path " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-quantize + exit 255 } INPUT_DTYPE=float32 @@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--input_dtype') INPUT_DTYPE="$2" @@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then echo "Error: input data not found" echo "" usage - exit 2 fi FILE_BASE=$(basename ${OUTPUT_PATH}) @@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # quantize circle echo "${DRIVER_PATH}/circle-quantizer" \ --quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \ diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake index 9b858ad..812149c 100644 --- a/compiler/one-cmds/requires.cmake +++ b/compiler/one-cmds/requires.cmake @@ -3,3 +3,4 @@ require("tflite2circle") require("circle2circle") require("circle-quantizer") require("record-minmax") +require("vconone") diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt index 862660e..f8a165b 100644 --- a/compiler/record-minmax/CMakeLists.txt +++ b/compiler/record-minmax/CMakeLists.txt @@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain) target_link_libraries(record-minmax luci_import) target_link_libraries(record-minmax luci_export) target_link_libraries(record-minmax luci_interpreter) +target_link_libraries(record-minmax vconone) install(TARGETS record-minmax DESTINATION bin) +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + nnas_find_package(GTest REQUIRED) GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp") target_include_directories(record_minmax_function_test PRIVATE include) diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp index ae4fcb7..8b09498 100644 --- a/compiler/record-minmax/driver/Driver.cpp +++ b/compiler/record-minmax/driver/Driver.cpp @@ -17,6 +17,13 @@ #include "RecordMinMax.h" #include +#include + +void print_version(void) +{ + std::cout << "record-minmax version " << vconone::get_string() << std::endl; + std::cout << vconone::get_copyright() << std::endl; +} int entry(const int argc, char **argv) { @@ -25,6 +32,13 @@ int entry(const int argc, char **argv) arser::Arser arser( "Embedding min/max values of activations to the circle model for post-training quantization"); + arser.add_argument("--version") + .nargs(0) + .required(false) + .default_value(false) + .help("Show version information and exit") + .exit_with(print_version); + arser.add_argument("--input_model") .nargs(1) 
.type(arser::DataType::STR) @@ -66,7 +80,7 @@ int entry(const int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } auto input_model_path = arser.get("--input_model"); diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake index 0545035..f6804ce 100644 --- a/compiler/record-minmax/requires.cmake +++ b/compiler/record-minmax/requires.cmake @@ -1,3 +1,4 @@ require("luci") require("safemain") require("arser") +require("vconone") diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp index cf30cd8..a0e65ee 100644 --- a/compiler/record-minmax/src/HDF5Importer.cpp +++ b/compiler/record-minmax/src/HDF5Importer.cpp @@ -20,6 +20,7 @@ #include #include +#include using Shape = luci_interpreter::Shape; using DataType = luci_interpreter::DataType; diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp index 45f0197..410ce3d 100644 --- a/compiler/record-minmax/src/MinMaxObserver.cpp +++ b/compiler/record-minmax/src/MinMaxObserver.cpp @@ -38,7 +38,8 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node, assert(node->opcode() != luci::CircleOpcode::UNPACK); assert(node->opcode() != luci::CircleOpcode::WHILE); - if (node->opcode() == luci::CircleOpcode::CONST) + if (node->opcode() == luci::CircleOpcode::CONST || + node->opcode() == luci::CircleOpcode::CIRCLECONST) { // node is not activation. Do nothing. return; diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp index d12a0d3..17c6aa6 100644 --- a/compiler/record-minmax/src/RecordMinMax.cpp +++ b/compiler/record-minmax/src/RecordMinMax.cpp @@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input auto node = iter->first; auto minmax = iter->second; - float min, max; + float min{0.0f}, max{0.0f}; if (mode == "percentile") { min = getNthPercentile(minmax.min_vector, min_percentile); diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp index 13b464d..e2f135a 100644 --- a/compiler/record-minmax/tests/RecordFunction.test.cpp +++ b/compiler/record-minmax/tests/RecordFunction.test.cpp @@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge) EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0)); EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100)); + + SUCCEED(); } TEST(GetNthPercentileTest, Simple) @@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple) { EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i)); } + + SUCCEED(); } TEST(GetNthPercentileTest, Float) @@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float) EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1)); EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14)); EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99)); + + SUCCEED(); } TEST(GetNthPercentileTest, FloatWithNegative) @@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative) EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1)); EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14)); EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99)); + + SUCCEED(); } TEST(GetNthPercentileTest, SigleElement) @@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement) EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0)); EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50)); EXPECT_FLOAT_NEAR(33, 
getNthPercentile(input, 100)); + + SUCCEED(); } TEST(GetNthPercentileTest, OutOfBoundary_NEG) @@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG) EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error); EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error); + + SUCCEED(); } TEST(GetNthPercentileTest, EmptyVector_NEG) @@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG) std::vector input; EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error); + + SUCCEED(); } } // namespace record_minmax diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt index d33059f..4421a46 100644 --- a/compiler/tfl-verify/CMakeLists.txt +++ b/compiler/tfl-verify/CMakeLists.txt @@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") add_executable(tfl-verify ${SOURCES}) target_include_directories(tfl-verify PRIVATE src) +target_link_libraries(tfl-verify arser) target_link_libraries(tfl-verify foder) target_link_libraries(tfl-verify mio_tflite) target_link_libraries(tfl-verify safemain) diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake index ed6b84d..79503f3 100644 --- a/compiler/tfl-verify/requires.cmake +++ b/compiler/tfl-verify/requires.cmake @@ -1,3 +1,4 @@ +require("arser") require("foder") require("mio-tflite") require("safemain") diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp index 81f6d54..6d18976 100644 --- a/compiler/tfl-verify/src/Driver.cpp +++ b/compiler/tfl-verify/src/Driver.cpp @@ -16,22 +16,31 @@ #include "VerifyFlatBuffers.h" +#include + #include #include #include int entry(int argc, char **argv) { - if (argc != 2) + arser::Arser arser; + arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify"); + + try { - std::cerr << "ERROR: Failed to parse arguments" << std::endl; - std::cerr << std::endl; - std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl; + arser.parse(argc, argv); + } + catch (const std::runtime_error &err) + { + std::cout << err.what() << std::endl; + std::cout << arser; return 255; } + auto verifier = std::make_unique(); - std::string model_file = argv[argc - 1]; + std::string model_file = arser.get("tflite"); std::cout << "[ RUN ] Check " << model_file << std::endl; diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp index 932a649..692ce48 100644 --- a/compiler/tflchef/core/src/ModelChef.cpp +++ b/compiler/tflchef/core/src/ModelChef.cpp @@ -413,6 +413,7 @@ template void cook_graph(const T &graph, CookParams &cp) quant_builder.add_min(quant_min); quant_builder.add_scale(quant_scale); quant_builder.add_zero_point(quant_zero_point); + quant_builder.add_quantized_dimension(quant.quantized_dimension()); // Update QuantizationParameters Index quant_index = quant_builder.Finish(); diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto index 792503b..55785c3 100644 --- a/compiler/tflchef/proto/tflchef.proto +++ b/compiler/tflchef/proto/tflchef.proto @@ -35,6 +35,7 @@ message TensorQuantization { repeated float max = 2; repeated float scale = 3; repeated int64 zero_point = 4; + optional int32 quantized_dimension = 5 [default = 0]; } message Operand { diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp index db62d0e..088961c 100644 --- a/compiler/tflchef/tflite/src/RecipeChef.cpp +++ b/compiler/tflchef/tflite/src/RecipeChef.cpp @@ -184,6 +184,8 @@ std::unique_ptr generate_recipe(const 
tflite::Model *model) for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx) chef_quant->add_zero_point(quant->zero_point()->Get(idx)); } + tflchef::TensorQuantization *chef_quant = operand->mutable_quant(); + chef_quant->set_quantized_dimension(quant->quantized_dimension()); } } diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp index cecfeeb..46e5b55 100644 --- a/compiler/tflchef/tools/file/Driver.cpp +++ b/compiler/tflchef/tools/file/Driver.cpp @@ -41,7 +41,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } int32_t model_version = 1; diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp index 1116dec..4d795a3 100644 --- a/compiler/tflchef/tools/reverse/Driver.cpp +++ b/compiler/tflchef/tools/reverse/Driver.cpp @@ -38,7 +38,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } std::string tflite_path = arser.get("tflite"); diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp index 3961d2f..38c9c06 100644 --- a/compiler/tfldump/driver/Driver.cpp +++ b/compiler/tfldump/driver/Driver.cpp @@ -33,7 +33,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << '\n'; std::cout << arser; - return 0; + return 255; } std::string tflite_path = arser.get("tflite"); diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt index a0a2e02..b1d1f61 100644 --- a/compiler/tflite2circle/CMakeLists.txt +++ b/compiler/tflite2circle/CMakeLists.txt @@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser) target_link_libraries(tflite2circle safemain) target_link_libraries(tflite2circle mio_tflite) target_link_libraries(tflite2circle mio_circle) +target_link_libraries(tflite2circle vconone) install(TARGETS tflite2circle DESTINATION bin) diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp index 67b8e33..2f11e0a 100644 --- a/compiler/tflite2circle/driver/Driver.cpp +++ b/compiler/tflite2circle/driver/Driver.cpp @@ -24,10 +24,25 @@ #include "CircleModel.h" #include "TFLModel.h" +#include + +void print_version(void) +{ + std::cout << "tflite2circle version " << vconone::get_string() << std::endl; + std::cout << vconone::get_copyright() << std::endl; +} + int entry(int argc, char **argv) { arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"}; + arser.add_argument("--version") + .nargs(0) + .required(false) + .default_value(false) + .help("Show version information and exit") + .exit_with(print_version); + arser.add_argument("tflite") .nargs(1) .type(arser::DataType::STR) @@ -42,7 +57,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } std::string tfl_path = arser.get("tflite"); diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake index ff19b74..837c287 100644 --- a/compiler/tflite2circle/requires.cmake +++ b/compiler/tflite2circle/requires.cmake @@ -2,3 +2,4 @@ require("arser") require("mio-tflite") require("mio-circle") require("safemain") +require("vconone") diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt new file mode 100644 index 0000000..b8cb793 --- /dev/null +++ b/compiler/vconone/CMakeLists.txt @@ -0,0 +1,31 @@ +if (NOT VCONONE_VERSION) + set(VCONONE_VERSION 0x0000000000080001) + # 
NOTE order is [build patch minor major] + # if VCONONE_VERSION is set with the -D option, it will be cached + # you may have to remove the cache file if you remove the -D option +endif() + +configure_file(version_cfg.h.in version_cfg.h @ONLY) + +set(DRIVER "driver/driver.cpp") + +file(GLOB_RECURSE SOURCES "src/*.cpp") +file(GLOB_RECURSE TESTS "src/*.test.cpp") +list(REMOVE_ITEM SOURCES ${TESTS}) + +add_library(vconone STATIC ${SOURCES}) +target_include_directories(vconone PUBLIC include) +target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) + +add_executable(one-version ${DRIVER}) +target_link_libraries(one-version vconone) +install(TARGETS one-version DESTINATION bin) + +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + +nnas_find_package(GTest REQUIRED) + +GTest_AddTest(vconone_test ${TESTS}) +target_link_libraries(vconone_test vconone) diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md new file mode 100644 index 0000000..c08dd63 --- /dev/null +++ b/compiler/vconone/README.md @@ -0,0 +1,14 @@ +# vconone + +_vconone_ provides the version number and version strings for the one-* commands and command +line tools. + +# Revise version number + +To revise the version number, update `VCONONE_VERSION` in `CMakeLists.txt` +or pass `-DVCONONE_VERSION=0x0000000100080001` at the cmake configure step. + +The value packs four 16-bit integers, `build`, `patch`, `minor` and `major`, in +that order. `build` is not used for now. + +Version `0x0000000100080001` is interpreted as `1.8.1`. diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp new file mode 100644 index 0000000..12bd0ee --- /dev/null +++ b/compiler/vconone/driver/driver.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +int main(int argc, char *argv[]) +{ + auto str = vconone::get_string(); + if (argc >= 2) + { + for (int c = 1; c < argc; ++c) + std::cout << argv[c] << " "; + std::cout << "version " << str << std::endl; + std::cout << vconone::get_copyright() << std::endl; + } + else + std::cout << str; + + return 0; +} diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h new file mode 100644 index 0000000..a6a1998 --- /dev/null +++ b/compiler/vconone/include/vconone/vconone.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
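The `quantized_dimension` field added to the tflchef quantization recipe above, together with the new `channel` granularity option in one-quantize, enables per-channel quantization: every slice along the quantized dimension carries its own scale and zero point instead of one pair for the whole tensor. A minimal standalone sketch of that semantics with made-up values, assuming the usual affine dequantization rule real = scale[c] * (q - zero_point[c]); this is illustrative and not code from this patch:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
      // Hypothetical uint8 weights: 2 channels along quantized_dimension = 0,
      // 3 values per channel; scale/zero_point hold one entry per channel.
      std::vector<std::vector<uint8_t>> q = {{10, 12, 14}, {100, 102, 104}};
      std::vector<float> scale = {0.5f, 0.25f};
      std::vector<int64_t> zero_point = {12, 102};

      for (std::size_t c = 0; c < q.size(); ++c)
        for (uint8_t v : q[c])
          // affine dequantization with that channel's parameters
          std::cout << scale[c] * (static_cast<int64_t>(v) - zero_point[c]) << " ";
      std::cout << std::endl; // prints: -1 0 1 -0.5 0 0.5

      return 0;
    }

With the default layer granularity the scale and zero_point lists hold a single entry and quantized_dimension keeps its default value of 0.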
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __VCON_ONE_H__ +#define __VCON_ONE_H__ + +#include +#include + +namespace vconone +{ + +struct four +{ + uint16_t major; + uint16_t minor; + uint16_t patch; + uint16_t build; // build is not used for now +}; + +union version { + uint64_t v; + four f; +}; + +/** + * @brief get_number will return version union structure + */ +version get_number(void); + +/** + * @brief get_string will return string of major.minor.patch (without build) + */ +std::string get_string(void); + +/** + * @brief get_string4 will return string of major.minor.patch.build + */ +std::string get_string4(void); + +/** + * @brief get_copyright will return copyright string + */ +std::string get_copyright(void); + +} // namespace vconone + +#endif // __VCON_ONE_H__ diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp new file mode 100644 index 0000000..9b693c6 --- /dev/null +++ b/compiler/vconone/src/version.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vconone/vconone.h" + +#include "version_cfg.h" + +#include + +namespace vconone +{ + +version get_number(void) +{ + version v; + v.v = VCONONE_VERSION; + return v; +} + +std::string get_string4(void) +{ + std::ostringstream ss; + + auto v = get_number(); + ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "." + << unsigned(v.f.build); + + return ss.str(); +} + +std::string get_string(void) +{ + std::ostringstream ss; + + auto v = get_number(); + ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch); + + return ss.str(); +} + +std::string get_copyright(void) +{ + std::string str; + str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n"; + str += "Licensed under the Apache License, Version 2.0\r\n"; + str += "https://github.com/Samsung/ONE"; + return str; +} + +} // namespace vconone diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp new file mode 100644 index 0000000..35a0647 --- /dev/null +++ b/compiler/vconone/src/version.test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
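The version encoding described in the vconone README and decoded by the `version` union in version.cpp above can be read straight off the hex literal: from most to least significant, the four 16-bit groups are build, patch, minor and major. A standalone sketch of the same decode using shifts (equivalent to the union layout on the little-endian targets it assumes), worked through on the example value from the README:

    #include <cstdint>
    #include <iostream>

    int main()
    {
      const uint64_t packed = 0x0000000100080001ULL; // example from the README
      const auto major = static_cast<uint16_t>(packed & 0xFFFF);
      const auto minor = static_cast<uint16_t>((packed >> 16) & 0xFFFF);
      const auto patch = static_cast<uint16_t>((packed >> 32) & 0xFFFF);
      const auto build = static_cast<uint16_t>((packed >> 48) & 0xFFFF);

      (void)build; // dropped by get_string(); get_string4() would append ".0"
      std::cout << major << "." << minor << "." << patch << std::endl; // 1.8.1
      return 0;
    }

The same reading explains the default `VCONONE_VERSION` of 0x0000000000080001 in the CMakeLists above: it decodes to 1.8.0.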
+ */ + +#include + +#include + +TEST(vconone, version_number) +{ + auto v = vconone::get_number(); + + ASSERT_NE(0x0000000000000000ULL, v.v); +} + +TEST(vconone, version_string) +{ + auto str = vconone::get_string(); + + ASSERT_NE("..", str); + ASSERT_NE("", str); +} + +TEST(vconone, version_string4) +{ + auto str = vconone::get_string4(); + + ASSERT_NE("...", str); + ASSERT_NE("", str); +} + +TEST(vconone, copyright) +{ + auto str = vconone::get_copyright(); + + ASSERT_NE("", str); +} diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in new file mode 100644 index 0000000..aa3ad9e --- /dev/null +++ b/compiler/vconone/version_cfg.h.in @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __VCON_ONE_VERSION_CFG_H__ +#define __VCON_ONE_VERSION_CFG_H__ + +#define VCONONE_VERSION @VCONONE_VERSION@ULL + +#endif // __VCON_ONE_VERSION_CFG_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h deleted file mode 100644 index 9699b5c..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
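The drivers touched earlier in this patch (record-minmax, tflite2circle, tfl-verify and the tflchef/tfldump tools) converge on the same arser wiring: an optional zero-argument --version flag that prints the vconone version via exit_with, and a parse failure that prints usage and returns 255 instead of 0. A consolidated sketch of that pattern for reference; the include paths, the get<std::string>() template argument and the "some-tool"/"model" names are assumptions (the angle-bracket contents are stripped in the diff text above), so treat this as illustrative rather than a drop-in driver:

    #include <arser/arser.h>     // assumed header path for arser
    #include <vconone/vconone.h> // assumed; matches vconone's public include directory

    #include <iostream>
    #include <stdexcept>
    #include <string>

    void print_version(void)
    {
      std::cout << "some-tool version " << vconone::get_string() << std::endl;
      std::cout << vconone::get_copyright() << std::endl;
    }

    int entry(int argc, char **argv)
    {
      arser::Arser arser{"some-tool processes a model file"};

      arser.add_argument("--version")
          .nargs(0)
          .required(false)
          .default_value(false)
          .help("Show version information and exit")
          .exit_with(print_version);

      arser.add_argument("model").type(arser::DataType::STR).help("Model file path");

      try
      {
        arser.parse(argc, argv);
      }
      catch (const std::runtime_error &err)
      {
        std::cout << err.what() << std::endl;
        std::cout << arser;
        return 255; // parse failures no longer return 0
      }

      std::string model_path = arser.get<std::string>("model"); // template argument assumed
      std::cout << "[ RUN ] Check " << model_path << std::endl;
      return 0;
    }

Defining entry() rather than main() mirrors these tools' convention of linking the safemain wrapper instead of providing main() directly, as the link lines in the CMake changes above suggest.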
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file CLArgOperationKernel.h - * @brief This file defines CLArgOperationKernel - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ -#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define interface for the argop kernel. - */ -class CLArgOperationKernel : public ICLKernel -{ -public: - /** - * @brief Default constructor. - */ - CLArgOperationKernel(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied - */ - CLArgOperationKernel(const CLArgOperationKernel &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied - * @return Reference of this instance - */ - CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved - */ - CLArgOperationKernel(CLArgOperationKernel &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved - * @return Reference of this instance - */ - CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; - /** - * @brief Initialise the kernel's input, output and border mode. - * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[out] output The output tensor, Data types supported: S32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Arg operation to perform. - * return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLArgOperationKernel - * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. - * @param[in] output The output tensor info, Data types supported: S32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Arg operation to perform. 
- * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ArgOperation op); - - /* - * @brief Run CLArgOperationKernel op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - uint32_t _axis; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h deleted file mode 100644 index b0357fe..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/** - * @file CLCastKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLCastKernel class - */ - -#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ -#define __ARM_COMPUTE_CLCASTKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define OpenCL kernel for cast operation - */ -class CLCastKernel : public ICLKernel -{ -public: - /** - * @brief Construct CLCastKernel object - */ - CLCastKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLCastKernel(const CLCastKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLCastKernel &operator=(const CLCastKernel &) = delete; - - /** - * @brief Construct CLCastKernel object using default move constructor - * @param[in] CLCastKernel object to move - */ - CLCastKernel(CLCastKernel &&) = default; - - /** - * @brief Allow instances of this class to be moved - * @param[in] CLCastKernel object to move - */ - CLCastKernel &operator=(CLCastKernel &&) = default; - - /** - * @brief Destruct this CLCastKernel object - */ - ~CLCastKernel() = default; - - /** - * @brief Initialise the kernel's input and output. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] input_subtype Sub data type of input. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h deleted file mode 100644 index 8615cf1..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ -#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform depthTospace operation */ -class CLDepthToSpaceKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthToSpaceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; - /** Default destructor */ - ~CLDepthToSpaceKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h deleted file mode 100644 index 9321c36..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ -#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices - * - * @note This kernel should be used ONLY for Midgard architectures - * - * This kernel performs the following computation: - * - * -# Convert a values from int8 to int32 - * -# Convert b values from int8 to int32 - * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 - * - */ -class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; - /** Initialise the kernel's input and output. - * - * @note This kernel should be used ONLY for Midgard architectures - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p - * input0 - * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type - * supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of - * the input matrices - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, - const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLGEMMLowpMatrixMultiplyKernelEx - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p - * input0 - * @param[in] output Output tensor to store the result of matrix multiplication. Data type - * supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of - * the input matrices - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, - const ITensorInfo *output, - const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h deleted file mode 100644 index dd2dbf6..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ -#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to calculate PReLU*/ -class CLPReLUKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLPReLUKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLPReLUKernel(const CLPReLUKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; - /** Allow instances of this class to be moved */ - CLPReLUKernel(CLPReLUKernel &&) = default; - /** Allow instances of this class to be moved */ - CLPReLUKernel &operator=(CLPReLUKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input Source tensor1. - * @param[in] alpha Source tensor2. - * @param[out] output Output tensor. - */ - void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - const ICLTensor *_alpha; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h deleted file mode 100644 index 4c0a82c..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ -#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform spaceTodepth operation */ -class CLSpaceToDepthKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLSpaceToDepthKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; - /** Default destructor */ - ~CLSpaceToDepthKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h deleted file mode 100644 index 9d174de..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ -#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. - */ -class CLTransposeConvLayerUpsampleKernel : public ICLKernel -{ -public: - /** Constructor */ - CLTransposeConvLayerUpsampleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsampleKernel & - operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; - /** Default Move Constructor. */ - CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; - /** Default move assignment operator */ - CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; - /** Default destructor */ - ~CLTransposeConvLayerUpsampleKernel() = default; - - /** Initialise the kernel's input and output. - * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data types supported: same as @p input. All but - * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only - * performed within the XY-plane. - * @param[in] inner_border Top and right inner border sizes. These rows and columns will be - * filled with zero. - * @param[in] info Contains padding and stride information described in @ref - * PadStrideInfo. - */ - void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, - const PadStrideInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayerUpsample - * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data types supported: same as @p input. All - * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is - * only performed within the XY-plane. - * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled - * with zero. - * @param[in] info Contains padding and stride information described in @ref - * PadStrideInfo. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const BorderSize &inner_border, const PadStrideInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - BorderSize _inner_border; - PadStrideInfo _info; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h deleted file mode 100644 index d4c9c61..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ -#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ - -#include "arm_compute/core/CPP/ICPPKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** CPP kernel to perform tensor upsample. 
- * - */ -class CPPUpsampleKernelEx : public ICPPKernel -{ -public: - const char *name() const override { return "CPPUpsampleKernelEx"; } - /** Default constructor */ - CPPUpsampleKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; - /** Allow instances of this class to be moved */ - CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; - /** Allow instances of this class to be moved */ - CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; - /** Default destructor */ - ~CPPUpsampleKernelEx() = default; - - /** Set the input and output of the kernel. - * - * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 - * @param[out] output The output tensor. Data types supported: Same as @p input - * @param[in] info Padding info. - */ - void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - bool is_parallelisable() const override; - -private: - const ITensor *_input; - ITensor *_output; - PadStrideInfo _info; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h deleted file mode 100644 index 4e9f097..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ -#define __ARM_COMPUTE_NECASTKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the cast layer kernel. */ -class NECastKernel : public INEKernel -{ -public: - const char *name() const override { return "NECastKernel"; } - /** Default constructor */ - NECastKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NECastKernel(const NECastKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NECastKernel &operator=(const NECastKernel &) = delete; - /** Default Move Constructor. */ - NECastKernel(NECastKernel &&) = default; - /** Default move assignment operator */ - NECastKernel &operator=(NECastKernel &&) = default; - /** Default destructor */ - ~NECastKernel() = default; - /** Set input, output tensors. - * - * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[out] output Destination tensor with the same dimensions of input. Data type supported: - * U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - */ - void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); - /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel - * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; - ITensor *_output; - SubDataType _input_subtype; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h deleted file mode 100644 index b62897e..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ -#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the depth to space kernel */ -class NEDepthToSpaceLayerKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } - /** Default constructor */ - NEDepthToSpaceLayerKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; - /** Default destructor */ - ~NEDepthToSpaceLayerKernelEx() = default; - /** Initialise the kernel's inputs and output. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape x value. - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEDepthToSpaceLayerKernelEx. - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape value. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h deleted file mode 100644 index 57de78d..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ -#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for an element-wise unary operation kernel - * - * Element-wise operation is computed by: - * @f[ output(x) = OP(input(x))@f] - * - */ -class NEElementwiseUnaryKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NEElementwiseUnaryKernelEx"; } - /** Default constructor */ - NEElementwiseUnaryKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default; - /** Default destructor */ - ~NEElementwiseUnaryKernelEx() = default; - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEElementwiseUnaryKernelEx - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] input First tensor input. Data types supported: F16/F32/S32. - * @param[in] output Output tensor. Data types supported: Same as @p input. - */ - void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output); - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEElementwiseUnaryKernelEx - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] input First tensor input info. Data types supported: F16/F32/S32. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a Status - */ - static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input, - const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - - /** Common signature for all the specialised arithmetic functions - * - * @param[in] input An input tensor. Data types supported: F16/F32/S32. - * @param[out] output The output tensor. Data types supported: Same as @p input. - * @param[in] window Region on which to execute the kernel. - */ - using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, - const Window &window); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); - - /** Function to use for the particular tensor types passed to configure() */ - std::function _function; - - const ITensor *_input; - ITensor *_output; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h deleted file mode 100644 index 722efd3..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__ -#define __ARM_COMPUTE_NEPRELUKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the kernel to perform Parametric Rectified Linear Unit - * - * Result is computed by: - * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f] - */ -class NEPReLUKernel : public INEKernel -{ -public: - const char *name() const override { return "NEPReLUKernel"; } - /** Default constructor */ - NEPReLUKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEPReLUKernel(const NEPReLUKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEPReLUKernel &operator=(const NEPReLUKernel &) = delete; - /** Allow instances of this class to be moved */ - NEPReLUKernel(NEPReLUKernel &&) = default; - /** Allow instances of this class to be moved */ - NEPReLUKernel &operator=(NEPReLUKernel &&) = default; - /** Initialise the kernel's inputs and output - * - * @param[in] input Input tensor. Data type supported: QASYMM8/F32 - * @param[in] alpha Alpha tensor. Data types supported: Same as @p input - * @param[out] output Output tensor. Data types supported: Same as @p input - */ - void configure(const ITensor *input, const ITensor *alpha, ITensor *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEPReLUKernel.h - * - * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32. - * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input. 
- * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a Status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, - const ITensorInfo *output); - static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, - const ITensorInfo &output); - -private: - const ITensor *_input; /**< Source tensor */ - const ITensor *_alpha; /**< Alpha tensor */ - ITensor *_output; /**< Destination tensor */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h deleted file mode 100644 index 0ffcf6b..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ -#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the space to depth kernel */ -class NESpaceToDepthLayerKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } - /** Default constructor */ - NESpaceToDepthLayerKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; - /** Default destructor */ - ~NESpaceToDepthLayerKernelEx() = default; - /** Initialise the kernel's inputs and output. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape value - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToDepthLayerKernelEx - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape value - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h index 97bc4ce..cfbd134 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -16,25 +16,14 @@ #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ #define __ARM_COMPUTE_CLFUNCTIONSEX_H__ -#include -#include #include -#include -#include #include #include #include #include #include -#include #include -#include -#include #include -#include -#include -#include -#include #include #include diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h deleted file mode 100644 index c37096f..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file CLArgOperation.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLArgOperation class - */ - -#ifndef __ARM_COMPUTE_CLARGOPERATION_H__ -#define __ARM_COMPUTE_CLARGOPERATION_H__ - -#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to execute CLArgOperation operation - */ -class CLArgOperation : public IFunction -{ -public: - /** - * @brief Construct a new CLArgOperation object - */ - CLArgOperation(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLArgOperation(const CLArgOperation &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLArgOperation &operator=(const CLArgOperation &) = delete; - - /** - * @brief Construct a new CLArgOperation object by using copy constructor - * @param[in] CLArgOperation object to move - */ - CLArgOperation(CLArgOperation &&) = default; - - /** - * @brief Assign a CLArgOperation object. - * @param[in] CLArgOperation object to assign. This object will be moved. - */ - CLArgOperation &operator=(CLArgOperation &&) = default; - - /** - * @brief Initialise the kernel's inputs and outputs. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[out] output The result of arg operation. Data types supported: S32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Arg operation to perform. - * @return N/A - */ - void configure(ICLTensor *input, ICLTensor *output, std::vector axis, ArgOperation op); - - /** - * @brief Static function to check if given info will lead to a valid configuration - * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S32/F32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[out] output The result of arg operation. Data types supported: S32. - * @param[in] op Arg operation to perform. - * @return a status - */ - static Status validate(const ITensorInfo *input, const std::vector &axis, - const ITensorInfo *output, ArgOperation op); - /** - * @brief Run the OpenCL kernel for this operation - * @return N/A - */ - void run() override; - -private: - ICLTensor *_input{nullptr}; - ICLTensor *_output{nullptr}; - std::vector _axis{}; - ArgOperation _arg_op{ArgOperation::MAX}; - - std::unique_ptr _interm_tensors{nullptr}; - std::unique_ptr _argop_kernels{nullptr}; - size_t _num_of_kernels{0}; -}; -} -#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h deleted file mode 100644 index eed5cb8..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ -#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLBatchToSpaceNDKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLBatchToSpaceND : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] block_size A pointer to an array of integer values specifying block sizes - * for spatial dimension. - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h deleted file mode 100644 index ebe0d8a..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file CLCast.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLCast class - */ - -#ifndef __ARM_COMPUTE_CLCAST_H__ -#define __ARM_COMPUTE_CLCAST_H__ - -#include "arm_compute/core/TypesEx.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLCastKernel. - * This converts the input tensor to the tensor of the output tensor's type. - */ -class CLCast : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's input and output - * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] input_subtype Sub data type of input. 
- */ - void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); -}; -} -#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h deleted file mode 100644 index d52a538..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ -#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLDepthToSpaceKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLDepthToSpace : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
- * @param[block_size] block size integer only - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); -}; -} // namesace arm_compute - -#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h new file mode 100644 index 0000000..409eaf5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" +#include "arm_compute/runtime/CL/functions/CLReverse.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +#include + +namespace arm_compute +{ +class ICLTensor; +/** Function to run the deconvolution layer. + * + * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input + * depending on the stride and pad info and then perform a 1x1 + * convolution pass. Input stride defines how many zeroes we should put between each element of the + * input and pad is the amount of padding. 
+ * + * The relation between input to output is as follows: + * \f[ + * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x + * \f] + * \f[ + * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y + * \f] + * + * where: + * width_input is the size of the first input dimension. + * height_input is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * stride_x and stride_y are the input strides of the first and second dimensions. + * + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse. + * + * This function calls the following OpenCL kernels/functions: + * + * -# @ref CLDeconvolutionLayerUpsample + * -# @ref CLConvolutionLayer + * + * And the following CPP kernels: + * -# @ref CLReverse + * + */ +class CLDirectTransposeConvLayer : public IFunction +{ +public: + /** Constructor */ + CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete; + /** Default move constructor */ + CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete; + /** Default move assignment operator */ + CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for + * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as the + * @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this + * is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor.
3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for + * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLDirectTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for input + * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + CLDeconvolutionLayerUpsample _scale_f; + CLConvolutionLayer _conv_f; + CLReverse _flip_weights; + + CLTensor _scaled_output; + ICLTensor *_original_weights; + CLTensor _weights_flipped; + CLTensor _flip_axis; + + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index 1a0284a..f3266f6 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -50,7 +50,7 @@ #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" namespace arm_compute { @@ -168,7 +168,7 @@ private: CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; CLScaleFactorSymm8Kernel _scale_factor_kernel; CLQuantizationSymmetricKernel _quant_input_kernel; - CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; CLMultiplyScaleFactorKernel _multiply_scale_kernel; CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to // add bias in diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h deleted file mode 100644 index 68aba74..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ -#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ - -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/MemoryGroup.h" - -namespace arm_compute -{ -class IMemoryManager; -class ICLTensor; - -/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the - * following OpenCL kernels: - * - * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of - * GEMMInfo is FALSE) - * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) - * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) - * -*/ -class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction -{ -public: - /** Constructor */ - CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move constructor */ - CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move assignment operator */ - CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Initialise the kernel's inputs, output - * - * @note GEMMLowp: low precision GEMM kernel. [A * B + C] - * This kernel performs the following computations: - * - * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. - * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them. - * -# Compute the matrix product of the resulting a * b in int32. - * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE - * - * @param[in] a First input tensor (Matrix A). 
Data type supported: QASYMM8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: - * S32 - * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if - * gemm_info.gemmlowp_output_stage != NONE - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - */ - void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, - const GEMMInfo &gemm_info = GEMMInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLGEMMLowpMatrixMultiplyCoreEx - * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type - * supported: S32 - * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if - * gemm_info.gemmlowp_output_stage != NONE - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - * - * @return a status - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - - // Kernels used - CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel; - CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - - // Temporary tensors - CLTensor _vector_sum_col; - CLTensor _vector_sum_row; - - int32_t _a_offset; - int32_t _b_offset; - bool _reshape_b_only_on_first_run; - bool _is_prepared; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h deleted file mode 100644 index 5121671..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ -#define __ARM_COMPUTE_CLLOGICALNOT_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLLogicalNot : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input Source tensor. Data types supported: QASYMM8. - * @param[out] output Output tensor. Data types supported: QASYMM8. - */ - void configure(ICLTensor *input, ICLTensor *output); -}; - -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h deleted file mode 100644 index 7fbe558..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLPRELU_H__ -#define __ARM_COMPUTE_CLPRELU_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLPReLU : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input. Data types supported: - * QASYMM8/F16/F32. - * @param[in] alpha. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Output tensor. Data types supported: Same as @p input. - */ - void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h deleted file mode 100644 index e83fb01..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/** - * @file CLPixelWiseDivision.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLPixelWiseDivision class - */ -#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ -#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLPixelWiseDivisionKernel. - */ -class CLPixelWiseDivision : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs, output and convertion policy. - * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output The output tensor, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or - * 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest - * even. - * @return N/A - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, - ConvertPolicy overflow_policy = ConvertPolicy::WRAP, - RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivision - * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: same as @p input1. - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale = 1.f, - ConvertPolicy overflow_policy = ConvertPolicy::WRAP, - RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); -}; -} -#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h deleted file mode 100644 index b49cbd8..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ -#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ - -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/runtime/CL/functions/CLGEMM.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLRNNLayerEx */ -class CLRNNLayerEx : public IFunction -{ -public: - /** Default constructor */ - CLRNNLayerEx(std::shared_ptr memory_manager = nullptr); - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that - * multiplies the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies - * the current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same - * as @p input - * @param[out] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. - */ - void configure(const ICLTensor *input, const ICLTensor *weights, - const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info); - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies - * the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the - * current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p - * input - * @param[in] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - CLGEMM _gemm_state_f; - CLSaturatedArithmeticOperationKernel _add_kernel; - CLActivationLayerKernel _activation_kernel; - CLFullyConnectedLayer _fully_connected_kernel; - CLCopyKernel _copy_kernel; - CLTensor _fully_connected_out; - CLTensor _gemm_output; - CLTensor _add_output; - bool _is_prepared; -}; -} -#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h deleted file mode 100644 index 2090b46..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ -#define __ARM_COMPUTE_CLSPACETODEPTH_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLSpaceToDepthKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLSpaceToDepth : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[block_size] block size integer only - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h deleted file mode 100644 index 03edd15..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/** - * @file CLStridedSlice.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class - */ - -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLStridedSliceKernel - */ -class CLStridedSliceEx : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs and outputs - * @param[in] input Tensor input. Data type supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] beginData 'begin' vector of strided slice operation - * @param[in] endData 'end' vector of strided slice operation - * @param[in] stridesData 'strides' vector of strided slice operation - * @param[in] beginMask If the ith bit is set, begin[i] is ignored - * @param[in] endMask If the ith bit is set, end[i] is ignored - * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the - * dimensionality by 1, taking on the value at index begin[i] - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); -}; -} -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h index 54a697e..5fb102e 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,16 +37,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ #define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ -#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" - -#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" - -#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -54,119 +49,102 @@ namespace arm_compute { -class ICLTensor; -/** Function to run the transpose convolution layer. - * - * @note This layer was copied in order to fix a bug computing to wrong output dimensions. - * - * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input - * depending on the stride and pad info and then perform a 1x1 - * convolution pass. Input stride defines how many zeroes we should put between each element of the - * input, pad is the amount of padding and finally a is a user - * specified value where a < stride - 1, that increases the padding top and right of the input - * image. 
- * - * The relation between input to output is as follows: - * \f[ - * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x - * \f] - * \f[ - * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y - * \f] - * - * where: - * width_input is the size of the first input dimension. - * height_input is the size of the second input dimension. - * width_output is the size of the first output dimension. - * height_output is the size of the second output dimension. - * kernel_x and kernel_y are the convolution sizes in x and y. - * stride_x and stride_y is the input stride of the first and second dimension. - * - * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. - * Therefore, it will be necessary to use the weights in the - * reverse order to perform an actual convolution. This is achieved by using the @ref - * CPPFlipWeightsKernel. - * - * This function calls the following OpenCL kernels/functions: - * - * -# @ref CLTransposeConvLayerUpsample - * -# @ref CLConvolutionLayer +/** Basic function to compute the deconvolution layer. This function calls the following OpenCL + * kernels/functions: * + * -# @ref CLGEMMDeconvolutionLayer + * -# @ref CLDirectTransposeConvLayer */ class CLTransposeConvLayer : public IFunction { public: - /** Constructor */ + /** Default constructor */ CLTransposeConvLayer(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayer(const CLTransposeConvLayer &) = delete; - /** Default move constructor */ - CLTransposeConvLayer(CLTransposeConvLayer &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete; - /** Default move assignment operator */ - CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, - * and an optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. - * Data type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions - * as the @p input. - * @param[in] info Contains padding and policies to be used in the - * transpose convolution, this is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been - * reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same + * as @p input. + * @param[out] output Output tensor. 
The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this + * is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. Data types supported: + * QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayer + * CLTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as + * @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. * - * @param[in] input Input tensor info. 
3 lower dimensions represent a single input, - * and an optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. - * Data type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[in] output Output tensor info. The output has the same number of dimensions - * as the @p input. - * @param[in] info Contains padding and policies to be used in the - * transpose convolution, this is decribed in @ref PadStrideInfo. - * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, - unsigned int innvalid_right, unsigned int invalid_bottom, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); + static DeconvolutionMethod + get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info); // Inherited methods overridden: void run() override; void prepare() override; private: - MemoryGroup _memory_group; - CLTransposeConvLayerUpsample _scale_f; - CLConvolutionLayer _conv_f; - CPPFlipWeightsKernel _flip_weights; - CLTensor _scaled_output; - ICLTensor *_original_weights; - CLTensor _weights_flipped; - bool _is_prepared; + std::shared_ptr _memory_manager; + std::unique_ptr _function; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h deleted file mode 100644 index 7570fe7..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ -#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */ -class CLTransposeConvLayerUpsample : public IFunction -{ -public: - /** Default constructor */ - CLTransposeConvLayerUpsample(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete; - /** Allow instances of this class to be moved */ - CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default; - /** Allow instances of this class to be moved */ - CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default; - /** Default destructor */ - virtual ~CLTransposeConvLayerUpsample() = default; - - /** Initialize the function's source, destination, interpolation type and border_mode. - * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] inner_border The number of zeros added to right and top edges of the input. - * @param[in] info Contains padding and policies to be used in the deconvolution. - */ - void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, - const PadStrideInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayerUpsample - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] inner_border The number of zeros added to right and top edges of the input. - * @param[in] info Contains padding and policies to be used in the deconvolution. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const BorderSize &inner_border, const PadStrideInfo &info); - - // Inherited methods overridden: - void run() override; - -private: - CLTransposeConvLayerUpsampleKernel _upsample; - ICLTensor *_output; -}; -} -#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h deleted file mode 100644 index 666afef..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ -#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ - -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref CPPUpsample */ -class CPPUpsampleEx : public ICPPSimpleFunction -{ -public: - /** Configure the upsample CPP kernel - * - * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 - * @param[out] output The output tensor. 
Data types supported: Same as @p input - * @param[in] info Padding information - */ - void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); -}; -} -#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h index 49504fd..3fad230 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -18,20 +18,13 @@ #include #include -#include -#include #include #include #include #include #include -#include -#include #include -#include #include -#include -#include #include #endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h deleted file mode 100644 index f0f0d81..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NECAST_H__ -#define __ARM_COMPUTE_NECAST_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ -class NECast : public INESimpleFunctionNoBorder -{ -public: - /** Configure the kernel. - * - * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[out] output Destination tensor with the same dimensions of input. 
Data type supported: - * U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - */ - void configure(const ITensor *input, ITensor *output, - SubDataType input_subtype = SubDataType::NONE); - /** Static function to check if given info will lead to a valid configuration of @ref NECast - * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype = SubDataType::NONE); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NECAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h deleted file mode 100644 index 005d85a..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ -#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */ -class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. 
- * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape value. - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEDepthToSpaceLayerEx. - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape x value. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h deleted file mode 100644 index 27a38e9..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ -#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunction.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to perform negative on an input tensor. */ -class NENegLayer : public INESimpleFunction -{ -public: - /** Initialize the function - * - * @param[in] input Input tensor. Data types supported: F16/F32/S32. - * @param[out] output Output tensor. Data types supported: same as @p input. 
- */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer - * - * @param[in] input First tensor input info. Data types supported: F16/F32/S32. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 39c57eb..56548a4 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -46,7 +46,7 @@ #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" #include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" @@ -164,7 +164,7 @@ private: MemoryGroup _memory_group; NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; NEQuantizationSymmetricKernel _quant_input_kernel; - NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; NEMultiplyScaleFactorKernel _multiply_scale_kernel; NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; Tensor _reshape_weights_output; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h deleted file mode 100644 index d844513..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ -#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "arm_compute/runtime/Tensor.h" - -#include - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following - * NEON kernels if the DOT product instruction is not available: - * - * -# @ref NEGEMMInterleave4x4Kernel - * -# @ref NEGEMMTranspose1xWKernel - * -# @ref NEGEMMLowpMatrixMultiplyKernel - * -# @ref NEGEMMLowpOffsetContributionKernel - * -# @ref NEActivationLayer - * - * otherwise if the DOT product instruction is available: - * - * -# @ref NEGEMMLowpOffsetContributionKernel - * -*/ -class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction -{ -public: - /** Constructor */ - NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move constructor */ - NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move assignment operator */ - NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision GEMM kernel - * This kernel performs the following computations: - * - * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. - * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. - * -# Compute the matrix product of the resulting a * b in int32. - * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is - * QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor (Matrix A). Data type supported: - * QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: - * S32 - * @param[out] output Output tensor. 
Data type supported: Data type supported: - * S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, - const GEMMInfo &gemm_info = GEMMInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEGEMMLowpMatrixMultiplyCoreEx - * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is - * QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor info (Matrix A). Data type supported: - * QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type - * supported: S32 - * @param[in] output Output tensor info. Data type supported: Data type supported: - * S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - * - * @return a status - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); - - // Inherited methods overridden - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - NEGEMMAssemblyDispatch _asm_glue; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; - NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; - - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - Tensor _signed_a; - Tensor _signed_output; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; - bool _flip_signedness; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h deleted file mode 100644 index ca84133..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEPRELU_H__ -#define __ARM_COMPUTE_NEPRELU_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref NEPReLUKernel */ -class NEPReLU : public INESimpleFunctionNoBorder -{ -public: - /** Initialise the kernel's inputs and output - * - * @param[in] input. Data types supported: QASYMM8/F32. - * @param[in] alpha. Data types supported: Same as @p input. - * @param[out] output Output tensor. Data types supported: Same as @p input. - */ - void configure(const ITensor *input, const ITensor *alpha, ITensor *output); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEPRELU_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h deleted file mode 100644 index 8a7b179..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ -#define __ARM_COMPUTE_NERNNLAYER_EX_H__ - -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Basic function to run @ref NERNNLayerEx */ -class NERNNLayerEx : public IFunction -{ -public: - /** Default constructor */ - NERNNLayerEx(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NERNNLayerEx(const NERNNLayerEx &) = delete; - /** Default move constructor */ - NERNNLayerEx(NERNNLayerEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; - /** Default move assignment operator */ - NERNNLayerEx &operator=(NERNNLayerEx &&) = default; - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that - * multiplies the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies - * the current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same - * as @p input - * @param[out] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. - */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, - const ITensor *bias, ITensor *hidden_state, ITensor *output, - ActivationLayerInfo &info); - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies - * the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the - * current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p - * input - * @param[in] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - NEGEMM _gemm_state_f; - NEArithmeticAdditionKernel _add_kernel; - NEActivationLayerKernel _activation_kernel; - NEFullyConnectedLayer _fully_connected_kernel; - NECopyKernel _copy_kernel; - Tensor _fully_connected_out; - Tensor _gemm_output; - Tensor _add_output; - bool _is_prepared; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h deleted file mode 100644 index 03ac457..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ -#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" -#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to perform reduce operation */ -class NEReduceMeanEx : public IFunction -{ -public: - /** Constructor */ - NEReduceMeanEx(std::shared_ptr memory_manager = nullptr); - /** Configure kernel - * - * @note Supported tensor rank: up to 4 - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 - * @param[in] reduction_axis Reduction axis vector. - * @param[in] keep_dims If positive, retains reduced dimensions with length 1. - * @param[out] output Destination tensor. Data type supported: Same as @p input - */ - void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, - ITensor *output); - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEReduceMeanEx - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 - * @param[in] reduction_axis Reduction axis vector. - * @param[in] keep_dims If positive, retains reduced dimensions with length 1. - * @param[in] output Destination tensor. Data type supported: Same as @p input - * - * @return A status - */ - static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, - bool keep_dims, const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - std::unique_ptr _reduction_kernels{nullptr}; - std::unique_ptr _reduced_outs{nullptr}; - NEReshapeLayer _reshape; - unsigned int _reduction_ops; - bool _keep_dims; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h deleted file mode 100644 index 3b695fb..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ -#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" -#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to spatial divide a tensor. This function calls the following NEON - * kernels/functions: - * - * -# @ref NEMemsetKernel - * -# @ref NESpaceToBatchLayerKernel - */ -class NESpaceToBatchLayerEx : public IFunction -{ -public: - /** Default constructor */ - NESpaceToBatchLayerEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; - /** Allow instances of this class to be moved */ - NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; - /** Allow instances of this class to be moved */ - NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; - /** Default destructor */ - virtual ~NESpaceToBatchLayerEx() = default; - /** Set the input and output tensors. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 - * @param[out] output Tensor output. Data types supported: same as @p input - */ - void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, - ITensor *output); - /** Set the input and output tensors. (Static block shape and paddings) - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape_x Block shape x value. - * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. - * @param[out] output Tensor output. 
Data types supported: same as @p input - */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, - const Size2D &padding_left, const Size2D &padding_right, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToBatchLayerEx - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 - * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 - * @param[in] output Tensor output info. Data types supported: same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, - const ITensorInfo *paddings, const ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToBatchLayerEx (Static block shape and paddings) - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape_x Block shape x value. - * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. - * @param[in] output Tensor output info. Data types supported: same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, - const Size2D &padding_left, const Size2D &padding_right, - const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ - NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ - bool _has_padding; /**< Flag to check if the output has padding */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h deleted file mode 100644 index 9f32616..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ -#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -namespace arm_compute -{ -class ITensor; - -/** This function calls the following NEON kernels/functions: - * - * -# @ref NESpaceToDepthLayerKernelEx - */ -class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape value - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToDepthLayerEx (Static block shape and paddings) - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape value - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 408d150..24ff5da 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,16 +37,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ #define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ -#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -59,8 +57,8 @@ namespace arm_compute { /** Function to run the deconvolution layer. * - * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the - * input depending on the stride and pad info and then perfrom a 1x1 + * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input + * depending on the stride and pad info and then perform a 1x1 * convolution pass. Input stride defines how many zeroes we should put between each element of the * input, pad is the amount of padding and finaly a is a user * specified value where a < stride - 1 that increases the padding top and right of the input image. @@ -81,21 +79,22 @@ namespace arm_compute * kernel_x and kernel_y are the convolution sizes in x and y. * stride_x and stride_y is the input stride of the first and second dimension. * - * The weights used by Transpose convolution are supposed to be the same as the ones used for - * Convolution. Therefore, it will be necessary to use the weights in the - * reverse order to perform an actual convolution. This is achieved by using the @ref - * CPPFlipWeightsKernel. + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse. * * This function calls the following NEON kernels/functions: * - * -# @ref CPPUpsample + * -# @ref CPPUpsampleEx * -# @ref NEConvolutionLayer + * -# @ref NEPermute + * -# @ref NEReverse * */ class NETransposeConvLayer : public IFunction { public: - /** Default constructor */ + /** Constructor */ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -112,37 +111,38 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8. + * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. + * supported: Same as @p input. * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type - * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input. + * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 + * for F16 input. * @param[out] output Output tensor. The output has the same number of dimensions as the @p - * input. + * input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom); /** Static function to check if given info will lead to a valid configuration of @ref - * NETransposeConvLayer + * NETransposeConvLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8. + * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. + * supported: Same as @p input. * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types - * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input. + * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. * @param[in] output Output tensor info. The output has the same number of dimensions as the @p - * input. + * input. * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
* * @return a status */ @@ -158,17 +158,11 @@ public: private: MemoryGroup _memory_group; NEConvolutionLayer _conv_f; - CPPUpsampleEx _upsample_f; - CPPFlipWeightsKernel _flip_weights; - NEPermute _permute_input; - NEPermute _permute_weights; - NEPermute _permute_output; + CPPUpsample _upsample_f; + NEReverse _flip_weights; Tensor _scaled_output; Tensor _weights_flipped; - Tensor _permuted_input; - Tensor _permuted_weights; - Tensor _permuted_output; - bool _is_nchw; + Tensor _flip_axis; const ITensor *_original_weights; ITensor *_input; PadStrideInfo _info; diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 7b6b974..ba42a24 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -55,16 +55,7 @@ using namespace arm_compute; const std::map CLKernelLibraryEx::_kernel_program_map = { // ARMComputeEx kernels - {"arg_op", "arg_operation.cl"}, - {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, {"binary_logical_op", "binary_logical_op.cl"}, - {"cast", "cast.cl"}, - {"cast_qasymm_in", "cast.cl"}, - {"cast_qasymm_out", "cast.cl"}, - {"comparison_op", "comparison_op.cl"}, - {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, - {"depth_to_space_nchw", "depth_to_space.cl"}, - {"depth_to_space_nhwc", "depth_to_space.cl"}, {"embedding_lookup", "embedding_lookup.cl"}, {"gather_ex", "gather_ex.cl"}, {"gather_ex_1d", "gather_ex.cl"}, @@ -74,10 +65,6 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"instance_normalization_ex", "instance_normalization_ex.cl"}, {"multiply_scale_factor", "multiply_scale_factor.cl"}, {"neg_tensor", "neg_tensor.cl"}, - {"permute_generic", "permute_ex.cl"}, - {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, - {"prelu", "prelu.cl"}, - {"prelu_qasymm8", "prelu_quantized.cl"}, {"quantization_symm8", "quantization_symm8.cl"}, {"reduce_min_max", "reduce_operation.cl"}, {"reduce_sum_mean", "reduce_operation.cl"}, @@ -91,29 +78,15 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"radixsort_reorder", "topkv2_radixsort.cl"}, {"topkv2_quicksort", "topkv2_quicksort.cl"}, {"scale_factor_symm8", "scale_factor.cl"}, - {"space_to_depth_nchw", "space_to_depth.cl"}, - {"space_to_depth_nhwc", "space_to_depth.cl"}, }; const std::map CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS { - "arg_operation.cl", -#include "./cl_kernels/arg_operation.clembed" - }, - { - "cast.cl", -#include "./cl_kernels/cast.clembed" - }, - { "embedding_lookup.cl", #include "./cl_kernels/embedding_lookup.clembed" }, { - "depth_to_space.cl", -#include "./cl_kernels/depth_to_space.clembed" - }, - { "gather_ex.cl", #include "./cl_kernels/gather_ex.clembed" }, @@ -150,14 +123,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/neg_tensor.clembed" }, { - "prelu.cl", -#include "./cl_kernels/prelu.clembed" - }, - { - "prelu_quantized.cl", -#include "./cl_kernels/prelu_quantized.clembed" - }, - { "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" }, @@ -170,10 +135,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/scale_factor.clembed" }, { - "space_to_depth.cl", -#include "./cl_kernels/space_to_depth.clembed" - }, - { "topkv2.cl", #include "./cl_kernels/topkv2.clembed" }, diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl deleted file mode 100644 index 
03717cf..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform arg_max/arg_min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. - * e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as - * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: - * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension - * (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension - * (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element - * in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension - * (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. 
- * Supported data types: U32 - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension - * (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis to be reduced. - */ - -__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - DATA_TYPE tval = value; - int idx = 0; - for (int i = 1; i < dim; ++i) - { - indices[axis] = i; - -#if OP_CODE == 1 // ArgMax - value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], - indices[2], indices[3]))); -#elif OP_CODE == 2 // ArgMin - value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], - indices[2], indices[3]))); -#else - return; - -#endif - - if (tval != value) - { - idx = indices[axis]; - tval = value; - } - } - - *((__global uint *)out.ptr) = idx; -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl deleted file mode 100644 index f74c1c1..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers_asymm.h" - -#ifdef SATURATE -#define ADD(x, y) add_sat((x), (y)) -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define ADD(x, y) (x) + (y) -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to - * QASYMM8 - * - * The following computations will be performed: - * - * -# Add offset terms to inputs - -# Get scaled value of two inputs - * -# Add inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - * @attention The inputs and output data types need to be passed at compile time using - * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The number of bits to shift left of input tensors must be passed at compile time using - * -DLEFT_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of input tensors - * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, - -DIN2_OFFSET, - * -RIN2_MULT_INT and -DIN2_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor - * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and - -DRESULT_SHIFT - * - * @attention The input and output data_types need to be passed at compile time using - * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The inputs and output scale information of qasymm8 need to be passed at compile time - * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: - * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f - * @attention The inputs and output scale offset need to be passed at compile time using - * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: - * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise - * wrapping policy will be used. 
- * - * @param[in] in1_ptr Pointer to the source tensor. - * Supported data types: QASYMM8 - * @param[in] in1_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: - * QASYMM8 - * @param[in] in2_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[out] out_ptr Pointer to the destination tensor. - * Supported data types: QASYMM8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension - * (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension - * (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination - * tensor - */ -__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Get scaled value of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - - VEC_DATA_TYPE(int, 16) - left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); - VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; - VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; - - VEC_DATA_TYPE(int, 16) - scaled_in1_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); - VEC_DATA_TYPE(int, 16) - scaled_in2_val = - 
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); - - // Add inputs and multiply with a multiplier smaller than 1 - VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; - VEC_DATA_TYPE(int, 16) - out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - - // TODO: Apply min-max BOUND to support fuse with relu. - /* - #if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); - #endif // defined(MIN_BOUND) - #if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); - #endif // defined(MAX_BOUND) - */ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl deleted file mode 100644 index 4147a00..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#ifndef SCALE -#define SCALE 1.0f -#endif -#ifndef OFFSET -#define OFFSET 0 -#endif -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) -/** Perform a cast operation on an input tensor. - * - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
- * -DVEC_SIZE=16 - * @attention -DBOOL_INPUT : Whether type of input is bool. - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); -#if defined(BOOL_INPUT) - VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); - VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); - res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); -#endif // defined(BOOL_INPUT) - - VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 input tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of input should be given as a preprocessor argument using - * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; - - VSTORE(VEC_SIZE) - (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, - (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 output tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of output should be given as a preprocessor argument using - * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); - - VSTORE(VEC_SIZE) - (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, - (__global DATA_TYPE_OUT *)output.ptr); -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl deleted file mode 100644 index 0285c95..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument - * using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - out_index[0] = get_global_id(0); // W - out_index[1] = get_global_id(1); // H - out_index[2] = get_global_id(2) % Z_OUT; // C - out_index[3] = get_global_id(2) / Z_OUT; // B - - in_index[0] = out_index[0] / BLOCK_SIZE; - in_index[1] = out_index[1] / BLOCK_SIZE; - in_index[2] = out_index[2] + - ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( - &in, in_index[0], in_index[1], in_index[2], in_index[3])); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) -/** Perform space to depth rearrangement of tensor (NHWC) - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument - * using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - out_index[0] = get_global_id(0); // C - out_index[1] = get_global_id(1); // W - out_index[2] = get_global_id(2) % Z_OUT; // H - out_index[3] = get_global_id(2) / Z_OUT; // B - - in_index[0] = out_index[0] + - ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; - in_index[1] = out_index[1] / BLOCK_SIZE; - in_index[2] = out_index[2] / BLOCK_SIZE; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( - &in, in_index[0], in_index[1], in_index[2], in_index[3])); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index 2d0b6a2..e07a25e 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H @@ -59,16 +58,219 @@ #pragma OPENCL EXTENSION cl_arm_printf : enable #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) +#define GPU_ARCH_MIDGARD 0x100 +#define GPU_ARCH_BIFROST 0x200 + +/** Concatenate two inputs. + * + * @param[in] a The first input to be concatenated + * @param[in] b The second input to be concatenated + * + * @return The concatenated output + */ +#define CONCAT(a, b) a##b + +/** Expand the given vector + * + * @param[in] x The vector to be expanded + * + * @return The expanded output + */ #define EXPAND(x) x +/** Clamp the given value between an upper and lower bound. + * + * @param[in] x The value to be clamped + * @param[in] min_val The lower bound + * @param[in] max_val The upper bound + * + * @return The clamped value. + */ #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) +/** REVn reverses the given vector whose size is n. 
+ * @name REVn + * + * @param[in] x The vector to be reversed + * + * @return The reversed vector + * @{ + */ +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) +#define REV16(x) ((x).sFEDCBA9876543210) +/** @} */ // end of group REVn + +/** Reverse the given vector. + * @name REVERSE + * + * @param[in] x The vector to be reversed + * @param[in] s The size of the vector + * + * @return The reversed vector + * @{ + */ +#define REVERSE_STR(x, s) REV##s((x)) +#define REVERSE(x, s) REVERSE_STR(x, s) +/** @} */ // end of group REVERSE + +/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. + * @name ROTs_n + * + * @param[in] x The vector to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROT1_0(x) ((x)) + +#define ROT2_0(x) ((x)) +#define ROT2_1(x) ((x).s10) + +#define ROT3_0(x) ((x)) +#define ROT3_1(x) ((x).s201) +#define ROT3_2(x) ((x).s120) + +#define ROT4_0(x) ((x)) +#define ROT4_1(x) ((x).s3012) +#define ROT4_2(x) ((x).s2301) +#define ROT4_3(x) ((x).s1230) + +#define ROT8_0(x) ((x)) +#define ROT8_1(x) ((x).s70123456) +#define ROT8_2(x) ((x).s67012345) +#define ROT8_3(x) ((x).s56701234) +#define ROT8_4(x) ((x).s45670123) +#define ROT8_5(x) ((x).s34567012) +#define ROT8_6(x) ((x).s23456701) +#define ROT8_7(x) ((x).s12345670) + +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_10(x) ((x).s6789ABCDEF012345) +#define ROT16_11(x) ((x).s56789ABCDEF01234) +#define ROT16_12(x) ((x).s456789ABCDEF0123) +#define ROT16_13(x) ((x).s3456789ABCDEF012) +#define ROT16_14(x) ((x).s23456789ABCDEF01) +#define ROT16_15(x) ((x).s123456789ABCDEF0) +/** @} */ // end of group ROTs_n + +/** Circular-right-shift (rotate-right) the given vector by the given amount. + * @name ROTATE + * + * @param[in] x The vector to be shifted + * @param[in] s The size of the vector + * @param[in] n The amount to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROTATE_STR(x, s, n) ROT##s##_##n(x) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +/** @} */ // end of group ROTATE + +/** Creates a vector of size n filled with offset values corresponding to the location of each + * element. + * @name V_OFFSn + * + * @param[in] dt The data type of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define V_OFFS1(dt) (dt)(0) +#define V_OFFS2(dt) (dt)(0, 1) +#define V_OFFS3(dt) (dt)(0, 1, 3) +#define V_OFFS4(dt) (dt)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +/** @} */ // end of group V_OFFSn + +/** Create a vector filled with offset values corresponding to the location of each element. 
+ * @name VEC_OFFS + * + * @param[in] dt The data type of the output vector + * @param[in] s The size of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +/** @} */ // end of group VEC_OFFS + #define VLOAD_STR(size) vload##size #define VLOAD(size) VLOAD_STR(size) #define VSTORE_STR(size) vstore##size #define VSTORE(size) VSTORE_STR(size) +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short +#define ushort1 ushort +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong +#define double1 double + +#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA + +// Convert built-in functions with _sat modifier are not supported in floating point so we create +// defines +// without _sat to overcome this issue +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 +#define convert_float16_sat convert_float16 +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short +#define convert_ushort1 convert_ushort +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong +#define convert_double1 convert_double + +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_short1_sat convert_short_sat +#define convert_ushort1_sat convert_ushort_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat +#define convert_double1_sat convert_double_sat + #define VEC_DATA_TYPE_STR(type, size) type##size #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index a83b1a8..5f1b3f9 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,29 +37,112 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
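
To make the intent of the helper macros added to helpers.h above concrete, here is a minimal usage sketch in OpenCL C. It is illustrative only and not part of the patch: the kernel name helpers_demo and its buffer arguments are hypothetical, and the example values in the comments assume v = (1, 2, 3, 4).

    #include "helpers.h"

    // Hypothetical demo kernel exercising the newly added vector helpers.
    __kernel void helpers_demo(__global int *in, __global int *out)
    {
        VEC_DATA_TYPE(int, 4) v = vload4(0, in);     // e.g. v = (1, 2, 3, 4)
        VEC_DATA_TYPE(int, 4) r = REVERSE(v, 4);     // expands to (v).s3210 -> (4, 3, 2, 1)
        VEC_DATA_TYPE(int, 4) s = ROTATE(v, 4, 1);   // rotate right by one -> (4, 1, 2, 3)
        VEC_DATA_TYPE(int, 4) o = VEC_OFFS(int4, 4); // offset vector (0, 1, 2, 3)
        vstore4(r + s + o, 0, out);
    }

The size-1 aliases added in the same hunk (int1, vload1, vstore1, and the convert_*1 / *_sat variants) let code that is generic over VEC_SIZE also compile when VEC_SIZE is 1.
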
*/ - #ifndef ARM_COMPUTE_HELPERS_ASYMM_H #define ARM_COMPUTE_HELPERS_ASYMM_H #include "helpers.h" +/** Convert the given vector with round to nearest even rounding mode + * + * @param[in] x The target to be converted + * @param[in] type The target type + * + * @return The converted vector + */ +#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) + +/** Quantize a floating-point scalar value to 8-bit asymmetric + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline uchar quantize_qasymm8(float input, float offset, float scale) +{ + float out_f32 = input / scale + offset; + uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); + return res_u8; +} + +/** Dequantize a scalar value from 8-bit asymmetric to floating-point + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline float dequantize_qasymm8(uchar input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline float dequantize_qasymm8_signed(char input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Quantize a vector of values from floating-point + * + * @param[in] type Output data type. + * @param[in] size Size of vector. + * + * @return quantized values + */ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ + VEC_DATA_TYPE(type, size)); \ + return res; \ + } + +/** Dequantize a vector of values to floating-point + * + * @param[in] type Input data type. + * @param[in] size Size of vector. + * + * @return dequantized values in floating point + */ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ + } + /** Correctly-rounded-to-nearest division by a power-of-two. * * @param[in] size Size of vector. * * @return Correctly-rounded-to-nearest division by a power-of-two. 
*/ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -81,9 +164,19 @@ b_64 = convert_long##size(b); \ VEC_DATA_TYPE(long, size) \ ab_64 = a_64 * b_64; \ - /* COMPMID-907 */ \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ return select(ab_x2_high32, INT_MAX, overflow); \ } @@ -335,9 +428,18 @@ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE_STR(input, offset, scale, type, size) \ + dequantize_##type##size(input, offset, scale) +#define DEQUANTIZE(input, offset, scale, type, size) \ + DEQUANTIZE_STR(input, offset, scale, type, size) + #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ asymm_rounding_divide_by_POW2_##size(x, exponent) #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ + ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ @@ -360,11 +462,53 @@ #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ + } +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) + +QUANTIZE_IMPL(uchar, 1) +QUANTIZE_IMPL(char, 1) +QUANTIZE_IMPL(uint, 1) +QUANTIZE_IMPL(int, 1) +QUANTIZE_IMPL(uchar, 4) +QUANTIZE_IMPL(ushort, 4) +QUANTIZE_IMPL(short, 4) +QUANTIZE_IMPL(uchar, 16) +QUANTIZE_IMPL(char, 16) +QUANTIZE_IMPL(ushort, 16) +QUANTIZE_IMPL(short, 16) +QUANTIZE_IMPL(uint, 16) +QUANTIZE_IMPL(int, 16) + +DEQUANTIZE_IMPL(uchar, 1) +DEQUANTIZE_IMPL(char, 1) +DEQUANTIZE_IMPL(uint, 1) +DEQUANTIZE_IMPL(int, 1) +DEQUANTIZE_IMPL(uchar, 4) +DEQUANTIZE_IMPL(ushort, 4) +DEQUANTIZE_IMPL(short, 4) +DEQUANTIZE_IMPL(uchar, 16) +DEQUANTIZE_IMPL(char, 16) +DEQUANTIZE_IMPL(ushort, 16) +DEQUANTIZE_IMPL(short, 16) +DEQUANTIZE_IMPL(uint, 16) +DEQUANTIZE_IMPL(int, 16) + +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) +ASYMM_MULT_IMPL(1) ASYMM_MULT_IMPL(2) ASYMM_MULT_IMPL(4) ASYMM_MULT_IMPL(8) @@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) +ASYMM_SELECT_USING_MASK_IMPL(1) ASYMM_SELECT_USING_MASK_IMPL(2) ASYMM_SELECT_USING_MASK_IMPL(4) ASYMM_SELECT_USING_MASK_IMPL(8) ASYMM_SELECT_USING_MASK_IMPL(16) +ASYMM_MASK_IF_ZERO_IMPL(1) ASYMM_MASK_IF_ZERO_IMPL(2) ASYMM_MASK_IF_ZERO_IMPL(4) ASYMM_MASK_IF_ZERO_IMPL(8) ASYMM_MASK_IF_ZERO_IMPL(16) +ASYMM_MASK_IF_NON_ZERO_IMPL(1) ASYMM_MASK_IF_NON_ZERO_IMPL(2) ASYMM_MASK_IF_NON_ZERO_IMPL(4) ASYMM_MASK_IF_NON_ZERO_IMPL(8) @@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) @@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) +ASYMM_RESCALE_IMPL(1) ASYMM_RESCALE_IMPL(2) ASYMM_RESCALE_IMPL(4) ASYMM_RESCALE_IMPL(8) ASYMM_RESCALE_IMPL(16) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) + #endif // ARM_COMPUTE_HELPERS_ASYMM_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl deleted file mode 100644 index 12c8eeb..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
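
For the quantization helpers introduced in this helpers_asymm.h hunk, a rough usage sketch follows, again illustrative only: the kernel name quant_demo, its buffers, and the scale/offset values are assumptions, not part of the patch.

    #include "helpers_asymm.h"

    // Hypothetical demo kernel: scalar and vector quantize/dequantize, plus a
    // fixed-point requantization via MULTIPLY_BY_QUANTIZED_MULTIPLIER.
    __kernel void quant_demo(__global uchar *in, __global uchar *out, __global int *acc)
    {
        const float scale  = 0.5f;   // assumed quantization parameters
        const float offset = 128.0f;

        // Scalar helpers: dequantize then requantize reproduces the input value.
        float x = dequantize_qasymm8(in[0], offset, scale); // (in[0] - 128) * 0.5
        out[0]  = quantize_qasymm8(x, offset, scale);

        // Vector forms generated by QUANTIZE_IMPL / DEQUANTIZE_IMPL(uchar, 4).
        uchar4 v = vload4(0, in);
        float4 f = DEQUANTIZE(v, offset, scale, uchar, 4);
        vstore4(QUANTIZE(f, offset, scale, uchar, 4), 1, out); // writes out[4..7]

        // Requantize int32 accumulators by ~0.25: qmul = 2^30 encodes a x0.5
        // multiplier, shift = -1 adds a rounding right-shift by one bit.
        vstore4(MULTIPLY_BY_QUANTIZED_MULTIPLIER(vload4(0, acc), 1073741824, -1, 4), 0, acc);
    }

The new size-1 instantiations (QUANTIZE_IMPL(uchar, 1), ASYMM_MULT_IMPL(1), and so on) follow the same pattern, presumably so the macros can also be used when VEC_SIZE is 1.
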
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data - * types : F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[in] alpha_ptr Pointer to the source image. Supported Data - * types : F16/F32 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source - * image - * - * @param[out] output_ptr Pointer to the destination image. 
Supported - * data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 - ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) - : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), - 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl deleted file mode 100644 index a66e107..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ - defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) -#define SELECT_TYPE VEC_INT - -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. - * -DDATA_TYPE_IN=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @note Can only take uchar data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data - * types : QASYMM8 - * @param[in] input1_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[in] alpha_ptr Pointer to the source image. Supported Data - * types : QASYMM8 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported - * data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); - VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); - - in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); - alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); - - const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); - const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); - const VEC_FLOAT outf32 = - select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); - const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); - const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); - - VSTORE(VEC_SIZE) - (res, 0, (__global uchar *)output.ptr); -} - -#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && - // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl deleted file mode 100644 index eb612f8..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. - * e.g. -DDEPTH_IN=16 - * @attention The value of the z-axis of input tensor depth should be given as a preprocessor - * argument using -DZ_IN=size. e.g. -DZ_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - in_index[0] = get_global_id(0); // W - in_index[1] = get_global_id(1); // H - in_index[2] = get_global_id(2) % Z_IN; // C - in_index[3] = get_global_id(2) / Z_IN; // B - - out_index[0] = in_index[0] / BLOCK_SIZE; - out_index[1] = in_index[1] / BLOCK_SIZE; - out_index[2] = - in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; - out_index[3] = in_index[3]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], - out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) - -#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. - * e.g. -DDEPTH_IN=16 - * @attention The value of the z-axis of input tensor depth should be given as a preprocessor - * argument using -DZ_IN=size. e.g. -DZ_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - in_index[0] = get_global_id(0); // C - in_index[1] = get_global_id(1); // W - in_index[2] = get_global_id(2) % Z_IN; // H - in_index[3] = get_global_id(2) / Z_IN; // B - - out_index[0] = - in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; - out_index[1] = in_index[1] / BLOCK_SIZE; - out_index[2] = in_index[2] / BLOCK_SIZE; - out_index[3] = in_index[3]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], - out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp deleted file mode 100644 index 06eeb5b..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) -{ - TensorShape out_shape{input_shape}; - - out_shape.set(axis, 1); - - return out_shape; -} -} // namespace - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ArgOperation /*op*/) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8, - DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) != - output->tensor_shape().num_dimensions(), - "Input's rank is not same with output"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), - "output shape's size does not match axis"); - - const auto num_dimensions = input->tensor_shape().num_dimensions(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); - return Status{}; -} - -} // namespace - -CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} - -void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, - ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - - _input = input; - _output = output; - _axis = axis; - - std::unique_ptr output_info = output->info()->clone(); - output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); - - // Construct kernel and set op_code based on type of ArgOperation as specified by object op - std::string kernel_name = "arg_op"; - int op_code = 0; - if (op == ArgOperation::MAX) - { - op_code = 1; - } - else if (op == ArgOperation::MIN) - { - op_code = 2; - } - else - throw std::runtime_error("Operation not supported, yet"); - - // Set kernel build options - std::set build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); - build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); - - // Create kernel - _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output_info, Steps()); - - Coordinates coord; - coord.set_num_dimensions(output_info->num_dimensions()); - 
output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t axis, ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - - return Status{}; -} - -void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &shape_in = _input->info()->tensor_shape(); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - - _kernel.setArg(idx++, _axis); - _kernel.setArg(idx++, shape_in[_axis]); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Copy output's shape in order to use for recovering at end of this method - const TensorShape shape_out = _output->info()->tensor_shape(); - _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); - - // Recover output's shape of output tensor - _output->info()->set_tensor_shape(shape_out); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index bb55568..fbc76f5 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp deleted file mode 100644 index 01ea655..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} - -void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Set kernel build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + - get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - // Create kernel - if (is_data_type_quantized_asymmetric(input->info()->data_type())) - { - UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); - const float scale_in = qinfo.scale; - const int offset_in = qinfo.offset; - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options())); - } - else if (is_data_type_quantized_asymmetric(output->info()->data_type())) - { - UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); - const float scale_in = qinfo.scale; - const float offset_in = qinfo.offset; - - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options())); - } - else - { - build_opts.add_option_if(input_subtype == 
SubDataType::BOOL, "-DBOOL_INPUT"); - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("cast", build_opts.options())); - } - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp deleted file mode 100644 index 3891368..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -// TODO Use this validation function -#if 0 -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, - "Output width should be equal to (Input width * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, - "Output height should be equal to (Input height * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, - "Input depth should be divisible by (block size * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(2) != input->dimension(2) / (block_size * block_size), - "Output depth should be equal to (Input depth / (block size * block size))"); - - return Status{}; -} -#endif -} // namespace - -CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) -{ - // DO NOTHING -} - -void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - // TODO Add validation of data_layout - _input = input; - _output = output; - - // Set kernel build options - auto layout_out = output->info()->data_layout(); - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); - auto depth = output->info()->dimension(index_depth); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); - build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); - - // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( - "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; -
add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 79f5ce0..67aaf2d 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp deleted file mode 100644 index 235e897..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "support/ToolchainSupport.h" - -#include -#include -#include - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, - const ITensorInfo *output, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, - "The number of dimensions for the matrix A must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, - "The number of dimensions for the matrix B must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && - gemm_info.reinterpret_input_as_3d(), - "The input1 tensor cannot have more than 2 dimensions if input0 " - "has to be reinterpreted as 3D"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k)); - if (gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != - static_cast<unsigned int>(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m)); - } - - if (output->total_size() != 0) - { - const TensorInfo tensor_info_output = - output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, - ITensorInfo *output, - const GEMMReshapeInfo &gemm_info, - ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_input_as_3d = false; - reinterpret_output_as_3d = false; - } - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output, - input0->clone() - ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)) - .set_data_type(DataType::S32)); - - TensorInfo tmp_info(*output); - - if (reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D - // GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x - // Note: if the dot product instruction is available, the 8x2 tile has to be used - num_elems_processed_per_iteration_x = 4; - num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); - - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the - // AccessWindowStatic - const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] - : input0->tensor_shape()[1]; - const int bottom_pad = - (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % - num_elems_processed_per_iteration_y; - - // Configure window - win = calculate_max_window( - tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window( - *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), - input0->dimension(1) + bottom_pad); - AccessWindowStatic input1_access( - input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - input1->dimension(1)); - AccessWindowStatic output_access( - output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); - - window_changed = - update_window_and_padding(win, input0_access, - input1_access) || // window used by the execute_window_loop - update_window_and_padding( - win_out, - output_access); // window used to update the padding requirements of output tensor - - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape())); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = - std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), - _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false) -{ -} - -void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1, - ICLTensor *output, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input0->info(), input1->info(), output->info(), gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d - ? _input0->info()->num_dimensions() - 1 - : _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), - gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Create build options - std::string kernel_name(" "); - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, - "-DHEIGHT_GEMM3D=" + - support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, - "-DDEPTH_GEMM3D=" + - support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, - "-DMATRIX_B_DEPTH=" + - support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + - support::cpp11::to_string(num_elements_processed.x())); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + - support::cpp11::to_string(num_elements_processed.y())); - - kernel_name = "gemmlowp_mm_midgard_ex"; - - // Create kernel - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += lower_string(string_from_data_type(input0->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); -} - -Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0, - const ITensorInfo *input1, - const ITensorInfo *output, - const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input0->clone().get(), input1->clone().get(), - output->clone().get(), gemm_info, num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if (_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if (_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = - _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if (_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = - 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - const unsigned int total_cross_plane_pad = - _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast<unsigned int>(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A - // more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution - // operation - if (!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, - static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, - static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, - static_cast<unsigned int>(_output->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint()); - } while (window.slide_window_slice_3D(slice)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 3a25987..3bfe3e4 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -45,6 +45,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/core/UtilsEx.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index 7fbdcda..930e7c9 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; @@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso _hits = hits; // Make _lookup_indices tensor - _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices = support::cpp14::make_unique<CLTensor>(); _lookup_indices->allocator()->init( TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index b45f6bb..61c14d2 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -48,7 +48,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" - +#include "support/StringSupport.h" #include "support/ToolchainSupport.h" namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index d305896..6b27c99 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -49,6 +49,7 @@ #include "arm_compute/core/Utils.h" #include
"arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index 74f7b41..643c8b1 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp deleted file mode 100644 index 8910a7b..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} - -void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); - - _input = input; - _alpha = alpha; - _output = output; - - // Create kernel - std::string kernel_name = "prelu"; - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if (is_data_type_quantized_asymmetric(input->info()->data_type())) - { - build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string( - input->info()->quantization_info().uniform().offset)); - build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string( - alpha->info()->quantization_info().uniform().offset)); - build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string( - output->info()->quantization_info().uniform().offset)); - build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string( - input->info()->quantization_info().uniform().scale)); - build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string( - alpha->info()->quantization_info().uniform().scale)); - build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string( - output->info()->quantization_info().uniform().scale)); - kernel_name += "_qasymm8"; - } - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input->info()->data_type() == DataType::F32 || - alpha->info()->data_type() == DataType::F32) - { -
set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); - - AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input->info()->tensor_shape(); - const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_input1); - add_3D_tensor_argument(idx, _alpha, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPReLUKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); - const unsigned int border = - std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index 2d551f6..1a7a18c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -49,6 +49,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "support/StringSupport.h" namespace arm_compute { @@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac // Output must always be initialized ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); return Status{}; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index a983183..06c2579 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; namespace diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index ff1904a..8d8853c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -48,6 +48,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "support/StringSupport.h" #include diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp deleted file mode 100644 index 64fc038..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), - "Input batch should be equal to Output batch"); - - auto layout_out = input->data_layout(); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - - auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); - auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); - auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), - "Output depth should be equal to (input depth * block size *block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || - (input->dimension(index_height) % block_size), - "Input height and width should be divisible by block size"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - 
(output->dimension(index_width) != (input->dimension(index_width) / block_size)) || - (output->dimension(index_height) != (input->dimension(index_height) / block_size)), - "Output height and width should be equal to " - "input_height/blocksize and input_width/blocksize respectively"); - - return Status{}; -} - -} // namespace - -CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} - -void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - auto layout_out = input->info()->data_layout(); - std::set build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); - auto depth = input->info()->dimension(index_depth); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); - build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); - - // Create kernel - _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( - "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp deleted file mode 100644 index 61999cb..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -using namespace arm_compute; - -CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() - : _input(nullptr), _output(nullptr), _inner_border(), _info() -{ -} - -Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, - const ITensorInfo *output, - const BorderSize &inner_border, - const PadStrideInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - - const DataLayout data_layout = input->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); - for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, - "inner_border_right must be smaller that stride_x"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, - "inner_border_top must be smaller that stride_y"); - - return Status{}; -} - -void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, - const 
BorderSize &inner_border, - const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _input = input; - _output = output; - _inner_border = inner_border; - _info = info; - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( - input->info(), output->info(), inner_border, info)); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - _kernel = static_cast( - CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const DataLayout data_layout = _input->info()->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - const int out_start_x = _info.pad_left(); - const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - - _info.pad_right() + _info.stride().first - 1; - const int out_step_x = _info.stride().first; - - const int out_start_y = _inner_border.top + _info.pad_top(); - const int out_end_y = - _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; - const int out_step_y = _info.stride().second; - - switch (data_layout) - { - case DataLayout::NCHW: - { - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - - Window slice_out = collapsed.first_slice_window_3D(); - slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); - slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); - - Window slice_in = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (collapsed.slide_window_slice_3D(slice_in) && - collapsed.slide_window_slice_3D(slice_out)); - break; - } - case DataLayout::NHWC: - { - // NOTE: not collapsing in NHWC - Window slice_out = window.first_slice_window_3D(); - slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); - slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); - - Window slice_in = window.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data layout"); - } -} diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp deleted file mode 100644 
index 648afb3..0000000 --- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include -#include - -namespace arm_compute -{ -CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} - -bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } - -void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, - const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _input = input; - _output = output; - _info = info; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICPPKernel::configure(win); -} - -void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - - // Initialize _scaled_output buffer - const int width_scaled = _output->info()->dimension(0); - const int height_scaled = _output->info()->dimension(1); - const int stride_x = _info.stride().first; - const int stride_y = _info.stride().second; - const int start_x = _info.pad_left(); - const int start_y = _info.pad_top(); - const int end_y = height_scaled - _info.pad_bottom(); - const int end_x = width_scaled - _info.pad_top(); - const size_t element_size = _input->info()->element_size(); - - // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset - const uint8_t fill_value = - _output->info()->data_type() == DataType::QASYMM8 - ? utility::clamp(_output->info()->quantization_info().uniform().offset) - : 0; - // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte - // values in a buffer of uint8_ts - std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); - - // Create window - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); - window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); - - // Create iterators - Iterator in(_input, window); - Iterator out(_output, window_out); - - execute_window_loop( - window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp deleted file mode 100644 index fbb9dbc..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NECastKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, - DataType::QASYMM8, DataType::U32, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && - input->data_type() != DataType::U8); - - if (output->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, - DataType::QASYMM8, DataType::U32, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - // Configure kernel window - Window win = calculate_max_window(*input, Steps()); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); - - // NECastKernel doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output->set_valid_region(ValidRegion(coord, output->tensor_shape())); - - return std::make_tuple(Status{}, win); -} - -typedef struct bool8x16 -{ - uint8x16_t val; -} 
bool8x16_t; - -static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } - -template inline ToV vcast(const FromV &v) { return v; } -template <> inline uint8x16_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 -} - -template <> inline uint32x4x4_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 - - const uint32x4x4_t ret = {{ - vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), - vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), - }}; - - return ret; -} - -template <> inline int32x4x4_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 - - const int32x4x4_t ret = {{ - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 - - const float32x4x4_t ret = {{ - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), - }}; - - return ret; -} - -template <> inline uint32x4x4_t vcast(const uint8x16_t &v) -{ - const uint32x4x4_t ret = {{ - vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), - vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), - }}; - - return ret; -} - -template <> inline int32x4x4_t vcast(const uint8x16_t &v) -{ - const int32x4x4_t ret = {{ - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const uint8x16_t &v) -{ - const float32x4x4_t ret = {{ - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), - }}; - - return ret; -} - -template <> inline uint8x16_t vcast(const int32x4x4_t &v) -{ - // Saturate cast - return 
vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), - vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); -} - -template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) -{ - // Saturate cast - const uint32x4x4_t ret = {{ - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const int32x4x4_t &v) -{ - const float32x4x4_t ret = {{ - vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), - vcvtq_f32_s32(v.val[3]), - }}; - - return ret; -} - -template <> inline uint8x16_t vcast(const uint32x4x4_t &v) -{ - return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), - vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); -} - -template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) -{ - const int32x4x4_t ret = {{ - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) -{ - const float32x4x4_t ret = {{ - vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), - vcvtq_f32_u32(v.val[3]), - }}; - - return ret; -} - -template <> inline uint8x16_t vcast(const float32x4x4_t &v) -{ - // Saturate cast - return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), - vqmovun_s32(vcvtq_s32_f32(v.val[1])))), - vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), - vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); -} - -template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) -{ - const uint32x4x4_t ret = {{ - vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), - vcvtq_u32_f32(v.val[3]), - }}; - - return ret; -} - -template <> inline int32x4x4_t vcast(const float32x4x4_t &v) -{ - const int32x4x4_t ret = {{ - vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), - vcvtq_s32_f32(v.val[3]), - }}; - - return ret; -} - -template struct cast_vector; -template <> struct cast_vector -{ - using type = bool8x16_t; -}; -template <> struct cast_vector -{ - using type = uint8x16_t; -}; -template <> struct cast_vector -{ - using type = uint32x4x4_t; -}; -template <> struct cast_vector -{ - using type = int32x4x4_t; -}; -template <> struct cast_vector -{ - using type = float32x4x4_t; -}; - -template inline void store_result(T *ptr, const typename cast_vector::type &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); - wrapper::vstore(ptr + 8, v.val[2]); - wrapper::vstore(ptr 
+ 12, v.val[3]); -} - -template <> inline void store_result(uint8_t *ptr, const uint8x16_t &v) -{ - wrapper::vstore(ptr, v); -} - -inline bool8x16_t vloadq(const bool *ptr) -{ - bool8x16_t ret; - ret.val = wrapper::vloadq(reinterpret_cast(ptr)); - return ret; -} - -template inline typename cast_vector::type load_input(const T *ptr) -{ - return wrapper::vloadq(ptr); -} - -template <> inline typename cast_vector::type load_input(const bool *ptr) -{ - return vloadq(ptr); -} - -template <> inline typename cast_vector::type load_input(const uint32_t *ptr) -{ - return vld4q_u32(ptr); -} - -template <> inline typename cast_vector::type load_input(const int32_t *ptr) -{ - return vld4q_s32(ptr); -} - -template <> inline typename cast_vector::type load_input(const float *ptr) -{ - return vld4q_f32(ptr); -} - -template inline T get_value(const T *ptr) { return *ptr; } - -template <> inline bool get_value(const bool *ptr) -{ - bool ret = (*ptr != 0); - return ret; -} - -template void run_cast(const ITensor *input, ITensor *output, const Window &window) -{ - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto in_ptr = reinterpret_cast(in.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - using from_vector = typename cast_vector::type; - const from_vector vin = load_input(in_ptr + x); - - switch (output->info()->data_type()) - { - case DataType::U8: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::QASYMM8: - { - using to_vector = typename cast_vector::type; - const UniformQuantizationInfo &qinfo_out = - output->info()->quantization_info().uniform(); - const auto vf = vcast(vin); - const auto vout = vquantize(vf, qinfo_out); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::U32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::S32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::F32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - FromT val = get_value(in_ptr + x); - switch (output->info()->data_type()) - { - case DataType::U8: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::QASYMM8: - { - const QuantizationInfo &qinfo_out = 
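// run_cast() above follows the usual NEON tiling pattern: the window's X dimension is
// collapsed so the kernel steps through it manually, a vector loop converts 16 input
// elements per iteration, and a scalar loop handles the leftover tail. A simplified,
// self-contained sketch of that control flow for a plain uint8 -> float conversion;
// the function is illustrative only and not part of the kernel:
#include <arm_neon.h>
#include <cstdint>

static void cast_u8_to_f32(const uint8_t *in, float *out, int n)
{
  int x = 0;
  for (; x <= n - 16; x += 16) // vector main loop, 16 elements per step
  {
    const uint8x16_t v = vld1q_u8(in + x);
    const uint16x8_t lo = vmovl_u8(vget_low_u8(v));
    const uint16x8_t hi = vmovl_u8(vget_high_u8(v));
    vst1q_f32(out + x + 0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))));
    vst1q_f32(out + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))));
    vst1q_f32(out + x + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))));
    vst1q_f32(out + x + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))));
  }
  for (; x < n; ++x) // scalar tail loop for the remaining elements
  {
    out[x] = static_cast<float>(in[x]);
  }
}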
output->info()->quantization_info(); - const auto qval = - quantize_qasymm8(static_cast(val), qinfo_out, rounding_policy); - *(reinterpret_cast(out.ptr()) + x) = qval; - break; - } - case DataType::U32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::S32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::F32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - }, - in, out); -} - -void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - const auto &qinfo_in = input->info()->quantization_info().uniform(); - const auto &qinfo_out = output->info()->quantization_info().uniform(); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto in_ptr = reinterpret_cast(in.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - using from_vector = typename cast_vector::type; - const auto vf = wrapper::vloadq(in_ptr + x); - const auto vin = vdequantize(vf, qinfo_in); - switch (output->info()->data_type()) - { - case DataType::U8: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::QASYMM8: - { - using to_vector = typename cast_vector::type; - const auto vf = vcast(vin); - const auto vout = vquantize(vf, qinfo_out); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::U32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::S32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::F32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - qasymm8_t qval_in = *(in_ptr + x); - const auto val = dequantize_qasymm8(qval_in, qinfo_in); - - switch (output->info()->data_type()) - { - case DataType::U8: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::QASYMM8: - { - const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy); - *(reinterpret_cast(out.ptr()) + x) = qval_out; - break; - } - case DataType::U32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::S32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - 
case DataType::F32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - }, - in, out); -} -} // namespace - -NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) -{ -} - -void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); - - _input = input; - _output = output; - _input_subtype = input_subtype; - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - INEKernel::configure(std::get<1>(win_config)); -} - -Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype)); - ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); - return Status{}; -} - -void NECastKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - switch (_input->info()->data_type()) - { - case DataType::U8: - if (_input_subtype == SubDataType::BOOL) - { - run_cast(_input, _output, window); - } - else - { - run_cast(_input, _output, window); - } - break; - case DataType::QASYMM8: - run_cast_qasymm8(_input, _output, window); - break; - case DataType::U32: - run_cast(_input, _output, window); - break; - case DataType::S32: - run_cast(_input, _output, window); - break; - case DataType::F32: - run_cast(_input, _output, window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp deleted file mode 100644 index 95e269d..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
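// For QASYMM8 input, run_cast_qasymm8() above dequantizes with the input tensor's
// (scale, offset), casts, and requantizes with the output tensor's (scale, offset) when
// the destination is also QASYMM8; the kernel rounds TO_NEAREST_EVEN on aarch64 and
// TO_ZERO elsewhere. A scalar sketch of that requantization step under the usual
// asymmetric-quantization formula -- illustrative only (std::lround stands in for the
// kernel's rounding policy, and the helper is not the library's quantize_qasymm8):
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t requantize_u8(uint8_t q_in, float scale_in, int32_t offset_in, float scale_out,
                             int32_t offset_out)
{
  const float real = scale_in * (static_cast<int32_t>(q_in) - offset_in); // dequantize
  const int32_t q = static_cast<int32_t>(std::lround(real / scale_out)) + offset_out;
  return static_cast<uint8_t>(std::min(255, std::max(0, q))); // requantize and clamp to u8
}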
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); - - const DataLayout data_layout = input->data_layout(); - const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != - 0); - // Validate output if initialized - if (output->total_size() != 0) - { - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != - (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != - (block_shape * input->tensor_shape()[idx_height])); - ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} -} // namespace - -NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() - : _input(nullptr), _output(nullptr), _block_shape() -{ -} - -void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, - int32_t block_shape) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); - - _input = input; - _output = output; - _block_shape = block_shape; - - // Configure 
kernel window - Window win = calculate_max_window(*input->info(), Steps()); - ICPPKernel::configure(win); -} - -Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); - return Status{}; -} - -void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - - const int idx_channel = - get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); - const int depth_size = _input->info()->dimension(idx_channel); - const int r = (depth_size / (_block_shape * _block_shape)); - const int element_size = _input->info()->element_size(); - - Window slice_out = window.first_slice_window_3D(); - - // The slice_out slice does not move - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Main loop for NCHW and NHWC - if (_input->info()->data_layout() == DataLayout::NCHW) - { - Window slice_in = window.first_slice_window_2D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop(slice_in, - [&](const Coordinates &id) { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{out_x, out_y, z, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_2D(slice_in)); - } - else - { - Window slice_in = window.first_slice_window_3D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop(slice_in, - [&](const Coordinates &id) { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{z, out_x, out_y, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_3D(slice_in)); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp deleted file mode 100644 index 200fc4f..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. 
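// The deleted NEDepthToSpaceLayerKernelEx only rearranges data: each input element is
// copied to an output coordinate derived from its channel index. With
// r = C_in / (block * block), input channel z splits into output channel z % r and a
// spatial offset z / r inside the block. A scalar NCHW model of that mapping over a flat
// buffer; the helper is illustrative and assumes C % (block * block) == 0:
#include <cstddef>
#include <vector>

static std::vector<float> depth_to_space_nchw(const std::vector<float> &in, int C, int H, int W,
                                              int block)
{
  const int r = C / (block * block); // output channel count
  std::vector<float> out(static_cast<size_t>(r) * H * block * W * block);
  for (int z = 0; z < C; ++z)
    for (int y = 0; y < H; ++y)
      for (int x = 0; x < W; ++x)
      {
        const int c_out = z % r;
        const int out_x = x * block + (z / r) % block;
        const int out_y = y * block + (z / r) / block;
        out[(static_cast<size_t>(c_out) * H * block + out_y) * W * block + out_x] =
            in[(static_cast<size_t>(z) * H + y) * W + x];
      }
  return out;
}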
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include -#include -#include -#include -#include - -namespace arm_compute -{ -class Coordinates; - -namespace -{ -template -inline ScalarType elementwise_op_scalar(const ScalarType &a) -{ - switch (op) - { - case ElementWiseUnaryEx::NEG: - return -a; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -inline VectorType elementwise_op(const VectorType &a) -{ - switch (op) - { - case ElementWiseUnaryEx::NEG: - return wrapper::vneg(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for (; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(output_ptr + x, - elementwise_op(wrapper::vloadq(input_ptr + x))); - } - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar(*(input_ptr + x)); - } - }, - input, output); -} - -template -std::function -configure_func(const ITensor *input, ITensor *output) -{ - std::string function_to_call("op_"); - function_to_call += string_from_data_type(input->info()->data_type()) + "_"; - function_to_call += string_from_data_type(output->info()->data_type()); - - static std::map - map_function = { - {"op_F32_F32", &elementwise_op}, {"op_S32_S32", &elementwise_op}, - }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - map_function["op_F16_F16"] = &elementwise_op; -#endif /* 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - auto it = map_function.find(function_to_call); - - if (it != map_function.end()) - { - auto func = it->second; - return [func](const ITensor *input, ITensor *output, const Window &window) { - func(input, output, window); - }; - } - return nullptr; -} -} // namespace - -NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() - : _function(nullptr), _input(nullptr), _output(nullptr) -{ -} - -void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, - ITensor *output) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Configure kernel window - const std::pair broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info()); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); - - Window win = calculate_max_window(valid_region); - - _input = input; - _output = output; - - INEKernel::configure(win); - - switch (op) - { - case ElementWiseUnaryEx::NEG: - _function = configure_func(input, output); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, - const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, - DataType::S32); - - // Validate in case of configured output - if (output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - } - - return Status{}; -} - -Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, - const ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); - return Status{}; -} - -void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_function == nullptr); - _function(_input, _output, window); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp deleted file mode 100644 index 641641b..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
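// configure_func() above dispatches on a key assembled from the input/output data-type
// names ("op_F32_F32", "op_S32_S32", ...) and returns a std::function wrapping the
// matching template instantiation, or nullptr when the combination is unsupported. The
// same lookup pattern in isolation, with a toy negate function standing in for the
// kernel's elementwise_op instantiations:
#include <functional>
#include <map>
#include <string>

static void negate_f32(const float *in, float *out, int n)
{
  for (int i = 0; i < n; ++i)
    out[i] = -in[i];
}

static std::function<void(const float *, float *, int)> pick_op(const std::string &in_type,
                                                                const std::string &out_type)
{
  static const std::map<std::string, void (*)(const float *, float *, int)> table = {
      {"op_F32_F32", &negate_f32},
  };
  const auto it = table.find("op_" + in_type + "_" + out_type);
  if (it == table.end())
    return nullptr; // unsupported type combination
  return it->second;
}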
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" - -#include - -using namespace arm_compute; -namespace -{ - -/** Conditional element-wise operations */ -enum class ConditionalOperation -{ - PRELU, /**< (x * y) for x < 0, x for x >= 0 */ -}; - -template -inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch (op) - { - case ConditionalOperation::PRELU: - res = a < 0 ? a * b : a; - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b, - QuantizationInfo qinfo) -{ - return quantize_qasymm8(elementwise_conditional_op_scalar(a, b), qinfo, - RoundingPolicy::TO_NEAREST_UP); -} - -template -inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b) -{ - VectorType res = {0, 0, 0, 0}; - VectorType const_0 = {0, 0, 0, 0}; - - switch (op) - { - case ConditionalOperation::PRELU: - res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b)); - ; - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - float32x4x4_t out = {{ - elementwise_conditional_op(a.val[0], b.val[0]), - elementwise_conditional_op(a.val[1], b.val[1]), - elementwise_conditional_op(a.val[2], b.val[2]), - elementwise_conditional_op(a.val[3], b.val[3]), - }}; - return out; -} - -template -inline VectorType elementwise_conditional_op_broadcast(const VectorType &a, - const ScalarType &broadcast_value, - const bool reorder) -{ - VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_conditional_op(reorder ? broadcast_vector : a, - reorder ? 
a : broadcast_vector); -} - -template -inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, - const ScalarType *input2_ptr, ScalarType *output_ptr) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_conditional_op(a, b)); - } - return x; -} - -template -inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, - int window_step_x, const uint8_t *input1_ptr, - const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, - float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_conditional_op(af, bf); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, - int window_step_x, - const ScalarType *non_broadcast_input_ptr, - const ScalarType &broadcast_value, - ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, - elementwise_conditional_op_broadcast(a, broadcast_value, reorder)); - } - return x; -} - -template -inline int elementwise_conditional_op_quantized_broadcast_loop( - int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = - load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_conditional_op(reorder ? broadcast_vector : af, - reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, - const Window &window) -{ - elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar, - &elementwise_conditional_op_broadcast_loop, - &elementwise_conditional_op_loop); -} - -template -void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, - const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar, - &elementwise_conditional_op_quantized_broadcast_loop, - &elementwise_conditional_op_quantized_loop); -} -} // namespace - -NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} - -void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); - - // Configure kernel window - const std::pair broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); - - Window win = calculate_max_window(valid_region); - - _input = input; - _alpha = alpha; - _output = output; - INEKernel::configure(win); -} - -void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - if (_input->info()->data_type() == DataType::F32) - { - elementwise_conditional_op(_input, _alpha, - _output, window); - } - else if (_input->info()->data_type() == DataType::QASYMM8) - { - elementwise_conditional_op_quantized(_input, _alpha, _output, - window); - } - else - { - ARM_COMPUTE_ERROR("Wrong Type"); - } -} - -Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, - const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); - - const TensorShape out_shape = - TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Checks performed when output is configured - if (output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha, - const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); - - return Status{}; -} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 6ba0f1f..5841f1d 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ 
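// The deleted NEPReLUKernel computes PReLU (x for x >= 0, alpha * x for x < 0) as a
// branch-free select: vcgt builds a per-lane mask where x > 0 and vbsl keeps x in those
// lanes while taking x * alpha in the rest. A minimal float32x4_t sketch of that select;
// the helper name is illustrative, not the kernel's:
#include <arm_neon.h>

static inline float32x4_t prelu_f32(float32x4_t x, float32x4_t alpha)
{
  const float32x4_t zero = vdupq_n_f32(0.0f);
  const uint32x4_t positive = vcgtq_f32(x, zero);     // lane mask: x > 0
  return vbslq_f32(positive, x, vmulq_f32(x, alpha)); // x where positive, alpha * x otherwise
}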
b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, DataType::F32); diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp deleted file mode 100644 index 44feb20..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - - ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); - - // Validate output if initialized - if (output->total_size() != 0) - { - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] != - output->tensor_shape()[idx_batch]); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) != - 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != - output->tensor_shape().total_size()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} -} // namespace - -NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx() - : _input(nullptr), _output(nullptr), _block_shape() -{ -} - -void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output, - int32_t block_shape) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); - - _input = input; - _block_shape = block_shape; - _output = output; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - INEKernel::configure(win); -} - -Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); - return Status{}; -} - -void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - - const DataLayout data_layout = _input->info()->data_layout(); - const int channel_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int element_size = _input->info()->element_size(); - - const size_t channel_size = _input->info()->dimension(channel_idx); - - Window slice_out = window.first_slice_window_3D(); - - int batch_id = 0; - - // Main loop for NCHW and NHWC - if (_output->info()->data_layout() == 
DataLayout::NCHW) - { - do - { - Iterator out(_output, slice_out); - execute_window_loop(slice_out, - [&](const Coordinates &id) { - const size_t channel_id = id.z(); - const size_t in_x = - id.x() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = - id.y() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{in_x, in_y, z, batch_id}; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); - ++batch_id; - } while (window.slide_window_slice_3D(slice_out)); - } - else - { - do - { - Iterator out(_output, slice_out); - execute_window_loop(slice_out, - [&](const Coordinates &id) { - const size_t channel_id = id.x(); - const size_t in_x = - id.y() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = - id.z() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{z, in_x, in_y, batch_id}; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); - ++batch_id; - } while (window.slide_window_slice_3D(slice_out)); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp deleted file mode 100644 index 2d379cf..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
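// The deleted NESpaceToDepthLayerKernelEx is the inverse gather: it iterates over output
// coordinates and reads from an input coordinate derived from the output channel, where
// c_out / C_in selects the position inside the block and c_out % C_in the original
// channel. A scalar NCHW model, assuming H and W are multiples of block; the helper is
// illustrative only:
#include <cstddef>
#include <vector>

static std::vector<float> space_to_depth_nchw(const std::vector<float> &in, int C, int H, int W,
                                              int block)
{
  const int Ho = H / block, Wo = W / block, Co = C * block * block;
  std::vector<float> out(static_cast<size_t>(Co) * Ho * Wo);
  for (int c_out = 0; c_out < Co; ++c_out)
    for (int y = 0; y < Ho; ++y)
      for (int x = 0; x < Wo; ++x)
      {
        const int in_x = x * block + (c_out / C) % block;
        const int in_y = y * block + (c_out / C) / block;
        const int in_c = c_out % C;
        out[(static_cast<size_t>(c_out) * Ho + y) * Wo + x] =
            in[(static_cast<size_t>(in_c) * H + in_y) * W + in_x];
      }
  return out;
}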
- */ - -#include "arm_compute/runtime/CL/functions/CLArgOperation.h" - -#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -namespace arm_compute -{ - -CLArgOperation::CLArgOperation() -{ - // DO NOTHING -} - -void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector axis, - ArgOperation op) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op)); - _input = input; - _output = output; - _axis = axis; - _arg_op = op; - // NOTE The argminmax_axis must have no duplication. - _num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = _num_of_kernels - 1; - - _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); - _argop_kernels = - arm_compute::support::cpp14::make_unique(_num_of_kernels); - - TensorShape shape{input->info()->tensor_shape()}; - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - shape.set(_axis[i], 1); - _interm_tensors[i].allocator()->init( - TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()) - .set_data_layout(input->info()->data_layout())); - _interm_tensors[i].allocator()->allocate(); - } - - // Set a vector that is ordered ICLTensors sequentially. - std::vector tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - tensors.emplace_back(_interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Apply ArgMinMax on all kernels - for (size_t i = 0; i < _num_of_kernels; i++) - { - _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op); - } -} - -Status CLArgOperation::validate(const ITensorInfo *input, const std::vector &axis, - const ITensorInfo *output, ArgOperation op) -{ - const size_t num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = num_of_kernels - 1; - - // Create temporary tensor infos - auto interm_tensors = - arm_compute::support::cpp14::make_unique(num_of_interm_tensors); - - // Create intermediate tensor info - TensorShape shape{input->tensor_shape()}; - - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - shape.set(axis[i], 1); - interm_tensors[i].set_data_type(input->data_type()); - interm_tensors[i].set_tensor_shape(shape); - interm_tensors[i].set_num_channels(input->num_channels()); - } - - // Set a vector that is ordered ITensorInfo sequentially. 
- std::vector tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - tensors.emplace_back(interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Validate argminmax only on all kernels - for (size_t i = 0; i < num_of_kernels; i++) - { - ARM_COMPUTE_RETURN_ON_ERROR( - CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op)); - } - - return Status{}; -} - -void CLArgOperation::run() -{ - for (size_t i = 0; i < _num_of_kernels; ++i) - { - CLScheduler::get().enqueue(_argop_kernels[i]); - } -} - -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp index 92ee69a..e5122ab 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -48,7 +48,7 @@ using namespace arm_compute; void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, BinaryLogicalOperation op) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input1, input2, output, op); _kernel = std::move(k); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp deleted file mode 100644 index b3118f3..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
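// The deleted CLArgOperation reduces over several axes by chaining one
// CLArgOperationKernel per axis: kernel i reads tensors[i] and writes tensors[i + 1],
// and every intermediate tensor is the input shape with the axes handled so far
// collapsed to 1. A small helper that reproduces that shape chain, using std::vector as
// a stand-in for TensorShape (illustrative only):
#include <vector>

static std::vector<std::vector<int>> chained_reduction_shapes(std::vector<int> shape,
                                                              const std::vector<int> &axes)
{
  std::vector<std::vector<int>> shapes; // shape after each kernel; the last is the output
  for (int axis : axes)
  {
    shape[axis] = 1;
    shapes.push_back(shape);
  }
  return shapes;
}
// For example, shape {2, 3, 4} with axes {1, 2} yields {2, 1, 4} and then {2, 1, 1}.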
- */ - -#include "arm_compute/runtime/CL/functions/CLCast.h" - -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -using namespace arm_compute; - -void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, input_subtype); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp deleted file mode 100644 index db66250..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" - -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -using namespace arm_compute; - -void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp new file mode 100644 index 0000000..3dede05 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include + +namespace arm_compute +{ +using namespace arm_compute::misc::shape_calculator; + +CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( + std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _flip_axis(), + _is_prepared(false) +{ +} + +Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, 
invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, + invalid_right, invalid_bottom, weights_info); +} + +void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, 
*input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, upsample_info); + + // Setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, + weights_info); + _scaled_output.allocator()->allocate(); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + _flip_axis.map(true); + auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); + if (weights->info()->data_layout() == DataLayout::NHWC) + { + axis_data[0] = 1; + axis_data[1] = 2; + } + else + { + axis_data[0] = 0; + axis_data[1] = 1; + } + _flip_axis.unmap(); +} + +void CLDirectTransposeConvLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _scale_f.run(); + _conv_f.run(); +} + +void CLDirectTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + // Free flipped weights + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp index 3d9a28a..ae9d8af 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups) { - auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index f098832..0198946 100644 ---
a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -45,7 +45,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); } @@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); @@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); @@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = TensorInfo( - input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 63e291b..2ff4b96 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -46,7 +46,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void 
CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); + auto k = support::cpp14::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 9aebc47..157b4d9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp fc->configure(input_to_use, _weights, _biases, _output); return std::unique_ptr<arm_compute::IFunction>(fc); } - else + else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) { - assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); - bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - weights->info()->data_type() == DataType::S8; + (weights->info()->data_type() == DataType::S8 || + weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) { auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager}; + ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info()); + const auto orgin_weights_data_type = weights_info->data_type(); + weights_info->set_data_type(DataType::QASYMM8_SIGNED); fc->configure(input_to_use, _weights, _biases, _output); + weights_info->set_data_type(orgin_weights_data_type); return std::unique_ptr<arm_compute::IFunction>(fc); } else @@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp return std::unique_ptr<arm_compute::IFunction>(fc); } } + else + { + throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type"); + } + }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp deleted file mode 100644 index ca5499d..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/MemoryGroup.h" - -namespace arm_compute -{ -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; - -namespace -{ -inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target) -{ - return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run); -} -} // namespace - -CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx( - std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0), - _reshape_b_only_on_first_run(false), _is_prepared(false) -{ -} - -void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b, - const ICLTensor *c, ICLTensor *output, - const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate( - a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), gemm_info)); - - _is_prepared = false; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_midgard_kernel.set_target(gpu_target); - - // GEMMRHSMatrixInfo rhs_info; - // GEMMLHSMatrixInfo lhs_info; - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, - // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d - ? (a->info()->dimension(1) * a->info()->dimension(2)) - : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - const ICLTensor *matrix_b = b; - // Configure matrix multiply kernel - _mm_midgard_kernel.configure( - a, matrix_b, output, - GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); -} - -Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, - const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_UNUSED(c); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), - "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), - "Matrix B already reshaped is not supported"); - - const ITensorInfo *matrix_a_info = a; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = - reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target); - - const GEMMReshapeInfo reshape_info = - GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - - TensorInfo weights_info(*b); - const ITensorInfo *matrix_b_info = &weights_info; - if (reshape_matrix_b) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(false, - "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b"); - } - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate( - matrix_a_info, matrix_b_info, output, reshape_info)); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyCoreEx::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run matrix multiply - CLScheduler::get().enqueue(_mm_midgard_kernel, false); -} - -void CLGEMMLowpMatrixMultiplyCoreEx::prepare() -{ - if (!_is_prepared) - { - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp index f594d7a..e0b833b 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -48,7 +48,7 @@ using namespace arm_compute; void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp index 27ed8e8..65b89a3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp index 80393e8..5a7e408 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma, ICLTensor *beta, float epsilon) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output, gamma, beta, epsilon); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp deleted file mode 100644 index fbb15ab..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * 
Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLPReLU.h" - -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, alpha, output); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp deleted file mode 100644 index 6049b7e..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), - _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), - _gemm_output(), _add_output(), _is_prepared(false) -{ -} - -Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info) -{ - const int idx_width = 0; - const int idx_height = 1; - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, - output); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != - recurrent_weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != - recurrent_weights->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), - hidden_state->tensor_shape()); - - auto shape_info = - TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, - input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate( - ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, 
ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); - - return Status{}; -} - -void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, - const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(), - recurrent_weights->info(), bias->info(), - hidden_state->info(), output->info(), info)); - - const int idx_height = 1; - TensorShape shape = - compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); - - _is_prepared = false; - - _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - - // Manage intermediate buffers and configure - _memory_group.manage(&_fully_connected_out); - _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); - - _memory_group.manage(&_gemm_output); - _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); - - _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _memory_group.manage(&_add_output); - - _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, - &_add_output, ConvertPolicy::SATURATE); - - _fully_connected_out.allocator()->allocate(); - _gemm_output.allocator()->allocate(); - - _activation_kernel.configure(&_add_output, hidden_state, info); - _add_output.allocator()->allocate(); - - _copy_kernel.configure(hidden_state, output); -} - -void CLRNNLayerEx::run() -{ - prepare(); - - _memory_group.acquire(); - - _fully_connected_kernel.run(); - _gemm_state_f.run(); - CLScheduler::get().enqueue(_add_kernel); - CLScheduler::get().enqueue(_activation_kernel); - - // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); - - _memory_group.release(); -} - -void CLRNNLayerEx::prepare() -{ - if (!_is_prepared) - { - _fully_connected_kernel.prepare(); - _gemm_state_f.prepare(); - - _is_prepared = true; - } -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index 8ce2d74..a41e6db 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); // Create temporary tensor infos - auto interm_tensors = - arm_compute::support::cpp14::make_unique(num_of_interm_tensors); + auto interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); // Create intermediate tensor info TensorShape shape{input->tensor_shape()}; @@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, const size_t num_of_kernels = axis.size(); const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 
1 : 0); - _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); - _reduce_kernels = - arm_compute::support::cpp14::make_unique(num_of_kernels); + _interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); + _reduce_kernels = support::cpp14::make_unique(num_of_kernels); // Set a vector that is ordered ICLTensors sequentially. std::vector tensors; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp deleted file mode 100644 index 7d7b226..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" - -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -using namespace arm_compute; - -void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index e61746e..3215d01 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,218 +37,124 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" -#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/UtilsEx.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CPP/CPPScheduler.h" +#include <cmath> #include <memory> #include <tuple> using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; -CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _is_prepared(false) +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_manager(std::move(memory_manager)), _function() +{ +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) { + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, + invalid_right, invalid_bottom, weights_info); +} + +void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, + output->info(), deconv_info, invalid_right, + invalid_bottom, weights_info)) + { + case DeconvolutionMethod::DIRECT: + { + auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, + invalid_bottom, weights_info); + _function = std::move(f); + break; + } + case DeconvolutionMethod::GEMM: + { + auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + f->configure(compile_context, input, weights, bias, output, deconv_info); + _function = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } } Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, - const PadStrideInfo &info, unsigned int invalid_right, + const PadStrideInfo &deconv_info, unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - const DataLayout data_layout = input->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); -
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); - - const unsigned int kernel_x = weights->dimension(idx_w); - const unsigned int kernel_y = weights->dimension(idx_h); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, - "invalid_right must be smaller than kernel_x"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, - "inner_border_top must be smaller than kernel_y"); - - // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. - auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, invalid_bottom); - - const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); - - if (bias != nullptr) + switch (CLTransposeConvLayer::get_deconvolution_method( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { - if (is_data_type_quantized_asymmetric(input->data_type())) + case DeconvolutionMethod::DIRECT: { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + // Validate direct convolution layer + ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + break; } - else + case DeconvolutionMethod::GEMM: { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + // Validate gemm-based convolution layer + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + break; } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + default: + ARM_COMPUTE_ERROR("Not supported."); + break; } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], - "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], - "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], - "Output's depth is invalid."); - - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); - TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - - ARM_COMPUTE_RETURN_ON_ERROR( - CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); - return Status{}; } -void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, - ICLTensor *output, const PadStrideInfo &info, - unsigned int invalid_right, unsigned int invalid_bottom, - const WeightsInfo &weights_info) +DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const 
WeightsInfo &weights_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; + ARM_COMPUTE_UNUSED(output, bias, weights_info); - const DataLayout data_layout = input->info()->data_layout(); + const DataLayout data_layout = input->data_layout(); const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - _original_weights = weights; - _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(weights, &_weights_flipped); - - // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were - // added. - auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); - - const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); - - // Output auto initialization if not yet initialized - auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); - - _is_prepared = weights_info.retain_internal_weights(); - - _memory_group.manage(&_scaled_output); - - // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order - // to match output shape - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); - - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), - input->info()->quantization_info()); - scale_out_info.set_data_layout(data_layout); - _scaled_output.allocator()->init(scale_out_info); - - // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, - DimensionRoundingType::FLOOR); - _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); - - // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); - _scaled_output.allocator()->allocate(); + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 || + invalid_bottom != 0) + { + return DeconvolutionMethod::DIRECT; + } + + return DeconvolutionMethod::GEMM; } void CLTransposeConvLayer::run() { prepare(); - - _memory_group.acquire(); - - _scale_f.run(); - _conv_f.run(); - - _memory_group.release(); + _function->run(); } -void CLTransposeConvLayer::prepare() -{ - if (!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - // Run weights flipping and mark original weights tensor as unused - _weights_flipped.allocator()->allocate(); - 
_weights_flipped.map(true); - _original_weights->map(CLScheduler::get().queue(), true); - CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); - _weights_flipped.unmap(); - _original_weights->unmap(CLScheduler::get().queue()); - _original_weights->mark_as_unused(); - - // Prepare convolution - _conv_f.prepare(); - - if (!_weights_flipped.is_used()) - { - _weights_flipped.allocator()->free(); - } - - _is_prepared = true; - } -} +void CLTransposeConvLayer::prepare() { _function->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp deleted file mode 100644 index 07feb5a..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/core/CL/ICLTensor.h" - -#include -#include -#include - -using namespace arm_compute; - -CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT - : _upsample(), - _output(nullptr) -{ -} - -Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, - const BorderSize &inner_border, - const PadStrideInfo &info) -{ - return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); -} - -void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, - const BorderSize &inner_border, - const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _output = output; - _upsample.configure(input, _output, inner_border, info); -} - -void CLTransposeConvLayerUpsample::run() -{ - _output->map(CLScheduler::get().queue(), true); - if (is_data_type_quantized_asymmetric(_output->info()->data_type())) - { - const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset; - std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); - } - else - { - memset(_output->buffer(), 0, _output->info()->total_size()); - } - _output->unmap(CLScheduler::get().queue()); - - CLScheduler::get().enqueue(_upsample, false); -} diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp index 114e1a7..768c15b 100644 --- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp @@ -41,14 +41,14 @@ #include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h" #include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" using namespace arm_compute; void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, const int axis) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(indices, depth, on_value, off_value, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp deleted file mode 100644 index 6c90ef3..0000000 --- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" - -#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, info); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp index ff81ff8..2752eb6 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp @@ -42,7 +42,7 @@ #include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h" #include "arm_compute/runtime/IRuntimeContext.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT void NEActivationLayerEx::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output, activation_info); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp index e42c453..2fc94b2 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -42,7 +42,7 @@ #include #include "arm_compute/core/ITensor.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -53,7 +53,7 @@ template void NEBinaryLogicalOperationStatic::configure(ITensor *input1, ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(COP, input1, input2, output); _kernel = std::move(k); } @@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic::validate(const ITensorInfo *input1, void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op) { - auto k = 
arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(op, input1, input2, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp deleted file mode 100644 index dc5c620..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NECast.h" - -#include "arm_compute/core/NEON/kernels/NECastKernel.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, input_subtype); - _kernel = std::move(k); -} - -Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype) -{ - return NECastKernel::validate(input, output, input_subtype); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp deleted file mode 100644 index 5ec0b86..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" - -namespace arm_compute -{ -void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_shape); - _kernel = std::move(k); -} - -Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp index 53fb150..e0ab3e0 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -41,13 +41,13 @@ #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" using namespace arm_compute; void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index f457732..a123439 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ 
b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); } @@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _scale_factor.allocator()->init( TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); @@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); @@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate quantization kernel - const ITensorInfo &quantized_input = TensorInfo( - input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR( NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index fcac3c7..dc6c784 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); bool is_hybrid = input->info()->data_type() == DataType::F32 && - weights->info()->data_type() == DataType::S8; + (weights->info()->data_type() == DataType::S8 || + weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) { auto fc = new 
arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; + ITensorInfo *weights_info = const_cast(_weights->info()); + const auto orgin_weights_data_type = weights_info->data_type(); + weights_info->set_data_type(DataType::QASYMM8_SIGNED); fc->configure(input_to_use, _weights, _biases, _output); + weights_info->set_data_type(orgin_weights_data_type); return std::unique_ptr(fc); } else diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp deleted file mode 100644 index 1290cfd..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx( - std::shared_ptr memory_manager) - : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), - _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), - _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), - _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), - _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), - _fuse_output_stage(false), _flip_signedness(false) -{ -} - -void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c, - ITensor *output, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate( - a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - const ITensor *matrix_a = a; - const ITensor *matrix_b = b; - GEMMInfo info = gemm_info; - - // Clear state - _mtx_a_reshape_kernel = nullptr; - _mtx_b_reshape_kernel = nullptr; - - // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - _original_b = b; - - const ITensor *a_to_use = a; - - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); - TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); - } - -#ifdef __aarch64__ - switch (a->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::U8: - case DataType::S8: - { - if (a_to_use->info()->data_type() == DataType::QASYMM8 && - info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - _asm_glue.configure(a_to_use, b, c, output, gemm_info); - _fused_assembly_path = _asm_glue.is_configured(); - } - else - { - _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, - gemm_info); - } - _assembly_path = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } -#endif /* __aarch64__ */ - if (!(_assembly_path || _run_vector_matrix_multiplication)) - { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / - // 4.0f) ] - TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, - a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / - // 16.0f) ] - TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), - b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if (!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a_to_use, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); - } - } - - if (!_fused_assembly_path) - { - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if (_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - - _vector_sum_col.allocator()->init(info_vector_sum_col); - if (!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if (_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), - false); - } - - if (_fuse_output_stage) - { - // Configure matrix multiply kernel - if (!_assembly_path) - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(matrix_a, matrix_b, &_mm_result_s32); - _mm_kernel = std::move(k); - } - - _offset_contribution_output_stage_kernel.configure( - &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset, - _b_offset, info.gemmlowp_output_stage()); - } - else - { - // Configure matrix multiply kernel - if (!_assembly_path) - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(matrix_a, matrix_b, output); - _mm_kernel = std::move(k); - } - // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? 
nullptr : &_vector_sum_row, - a_to_use->info()->dimension(0), _a_offset, _b_offset); - } - } - - // Allocate tensors - if (!_assembly_path && !_run_vector_matrix_multiplication) - { - _tmp_a.allocator()->allocate(); - if (!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if (!_fused_assembly_path) - { - if (_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if (_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); - } - } - - if (_fuse_output_stage) - { - _mm_result_s32.allocator()->allocate(); - } -} - -Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, - const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, - "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is " - "equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), - "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), - "Matrix B already reshaped is not supported"); - - GEMMInfo info = gemm_info; - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - const ITensorInfo *a_to_use = a; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo mm_result_s32_info{}; - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if (fuse_output_stage) - { - auto_init_if_empty( - mm_result_s32_info, - a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); - } - - // Check if we need to run the optimized assembly kernel - bool run_optimised = false; - bool run_optimised_requantized = false; - if (a_to_use->data_type() == DataType::QASYMM8 && - info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); - run_optimised_requantized = run_optimised; - } - else - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate( - a_to_use, b, c, fuse_output_stage ? 
&mm_result_s32_info : output, gemm_info)); - } - - if (run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if (info.depth_output_gemm3d() != 0) - { - if (info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), - "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, - "NEGEMM cannot reinterpret the output tensor as 3D"); - - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if (!run_vector_matrix_multiplication) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / - // 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width - // / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } - } - - if (!run_optimised_requantized) - { - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if (a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate( - b, &info_vector_sum_col, a->dimension(0), false)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if (b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate( - a_to_use, &info_vector_sum_row, a->dimension(0), false)); - } - - if (fuse_output_stage) - { - if (!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate( - matrix_a_info, matrix_b_info, &mm_result_s32_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate( - &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, - info.gemmlowp_output_stage())); - } - else - { - if (!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); - } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate( - output, a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset)); - } - } - return Status{}; -} - -void NEGEMMLowpMatrixMultiplyCoreEx::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reshape inputs - if (_mtx_a_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - - // Run GEMM - if (_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } - - if (!_fused_assembly_path) - { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if (_b_offset != 0) - { - NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if (_a_offset != 0 && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); - } - - if (_fuse_output_stage) - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); - } - else - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); - } - } -} - -void NEGEMMLowpMatrixMultiplyCoreEx::prepare() -{ - if (!_is_prepared) - { - // Run assembly reshape - if (_asm_glue.is_configured() && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - _asm_glue.prepare(); - _original_b->mark_as_unused(); - } - // Run non-assembly reshape - else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - _original_b->mark_as_unused(); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if (_a_offset != 0 && _reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); - } - - _is_prepared = true; - } -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp index c8bb88a..433c35d 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -41,7 +41,7 @@ #include "arm_compute/runtime/NEON/functions/NEGatherEx.h" #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -49,7 +49,7 @@ namespace arm_compute { void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, indices, output, axis); _kernel = std::move(k); 
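  // Editor's note (inserted comment, not part of the original patch): the
  // angle-bracketed template arguments appear to have been stripped from this
  // patch text during extraction, so calls render as "make_unique();",
  // "reinterpret_cast(...)", "std::shared_ptr memory_manager", and so on.
  // Based on the NEGatherKernelEx header included above, the body of
  // NEGatherEx::configure() presumably reads:
  //
  //   auto k = support::cpp14::make_unique<NEGatherKernelEx>();
  //   k->configure(input, indices, output, axis);
  //   _kernel = std::move(k);
  //
  // The same reconstruction, make_unique<KernelType>() with the kernel type
  // named in the corresponding #include, presumably applies to the other
  // configure() bodies in this patch. The template parameters shown here are
  // assumptions recovered from context, not text present in the patch as
  // rendered in this document.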
} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp index 078019f..52d58ac 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -41,14 +41,14 @@ #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" using namespace arm_compute; void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, ITensor *hits) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp deleted file mode 100644 index dac3b84..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NEPReLU.h" - -#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; - -void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, alpha, output); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp deleted file mode 100644 index 0e9a5e9..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -NERNNLayerEx::NERNNLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), - _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), - _gemm_output(), _add_output(), _is_prepared(false) -{ -} - -Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, - output); - - const int idx_width = 0; - const int idx_height = 1; - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != - recurrent_weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != - recurrent_weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), - hidden_state->tensor_shape()); - - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape( - recurrent_weights, hidden_state->dimension(idx_height)), - 1, input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate( - &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); - - return Status{}; -} - -void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights, - const ITensor *recurrent_weights, const ITensor *bias, - ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(), - recurrent_weights->info(), bias->info(), - hidden_state->info(), output->info(), info)); - - const int idx_height = 1; - TensorShape shape = misc::shape_calculator::compute_rnn_shape( - recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); - - _is_prepared = false; - - // Manage intermediate buffers and configure - _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - - // Manage intermediate buffers and configure - _memory_group.manage(&_fully_connected_out); - _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); - - _memory_group.manage(&_gemm_output); - _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, 
&_gemm_output, 1.f, 0.f); - - _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _memory_group.manage(&_add_output); - - _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, - ConvertPolicy::SATURATE); - - _fully_connected_out.allocator()->allocate(); - _gemm_output.allocator()->allocate(); - - _activation_kernel.configure(&_add_output, hidden_state, info); - _add_output.allocator()->allocate(); - - _copy_kernel.configure(hidden_state, output); -} - -void NERNNLayerEx::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - _fully_connected_kernel.run(); - - _gemm_state_f.run(); - - NEScheduler::get().schedule(&_add_kernel, Window::DimY); - NEScheduler::get().schedule(&_activation_kernel, Window::DimY); - - // copy hidden out to output - NEScheduler::get().schedule(&_copy_kernel, Window::DimY); -} - -void NERNNLayerEx::prepare() -{ - if (!_is_prepared) - { - _fully_connected_kernel.prepare(); - _gemm_state_f.prepare(); - - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp deleted file mode 100644 index 116bba3..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() -{ -} - -Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, - bool keep_dims, const ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(keep_dims); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); - - TensorShape out_shape = input->tensor_shape(); - const unsigned int reduction_ops = reduction_axis.num_dimensions(); - const int input_dims = input->num_dimensions(); - Coordinates axis_local = reduction_axis; - - // Convert negative axis - for (unsigned int i = 0; i < reduction_ops; ++i) - { - axis_local[i] = wrap_around(axis_local[i], input_dims); - } - - std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for (unsigned int i = 0; i < reduction_ops; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); - ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > - input->num_dimensions() - 1); - if (output->total_size() > 0 && keep_dims) - { - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); - } - if (keep_dims) - { - out_shape.set(axis_local[i], 1); - } - else - { - out_shape.remove_dimension(axis_local[i] - i); - } - } - const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - - return Status{}; -} - -void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, - ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - - _reduction_ops = reduction_axis.num_dimensions(); - _reduction_kernels = - arm_compute::support::cpp14::make_unique(_reduction_ops); - _reduced_outs = - arm_compute::support::cpp14::make_unique(_reduction_ops - (keep_dims ? 1 : 0)); - _keep_dims = keep_dims; - - Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); - const unsigned int reduction_ops = reduction_axis.num_dimensions(); - - // Convert negative axis - for (unsigned int i = 0; i < reduction_ops; ++i) - { - axis_local[i] = wrap_around(axis_local[i], input_dims); - } - - // Perform reduction for every axis - for (unsigned int i = 0; i < _reduction_ops; ++i) - { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() - : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); - out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); - - if (i == _reduction_ops - 1 && keep_dims) - { - _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); - } - else - { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), - input->info()->data_type(), - input->info()->quantization_info()) - .set_data_layout(output->info()->data_layout())); - _memory_group.manage(_reduced_outs.get() + i); - _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], - ReductionOperation::MEAN_SUM); - } - } - - // Allocate intermediate tensors - for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) - { - _reduced_outs[i].allocator()->allocate(); - } - - // Configure reshape layer if we want to drop the dimensions - if (!keep_dims) - { - TensorShape out_shape = input->info()->tensor_shape(); - - // We have to sort the reduction axis vectors in order for remove_dimension - // to work properly - std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for (unsigned int i = 0; i < _reduction_ops; ++i) - { - out_shape.remove_dimension(axis_local[i] - i); - } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); - } -} - -void NEReduceMeanEx::run() -{ - _memory_group.acquire(); - - for (unsigned int i = 0; i < _reduction_ops; ++i) - { - _reduction_kernels[i].run(); - } - - if (!_keep_dims) - { - _reshape.run(); - } - _memory_group.release(); -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp deleted file mode 100644 index 198bb76..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) -{ -} - -void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, - const ITensor *paddings, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - - if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) - { - _has_padding = true; - _memset_kernel.configure( - output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); - } - _space_to_batch_kernel.configure(input, block_shape, paddings, output); -} - -void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, - const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) - { - _has_padding = true; - _memset_kernel.configure( - output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); - } - _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, - output); -} - -Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, - const ITensorInfo *paddings, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR( - NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); - - return Status{}; -} - -Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, - const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( - input, block_shape_x, block_shape_y, padding_left, padding_right, output)); - - return Status{}; -} - -void NESpaceToBatchLayerEx::run() -{ - // Zero out output only if we have paddings - if (_has_padding) - { - NEScheduler::get().schedule(&_memset_kernel, Window::DimY); - } - NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp deleted file mode 100644 index 97697e3..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" - -namespace arm_compute -{ -void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_shape); - _kernel = std::move(k); -} - -Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); - return Status{}; -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index df06892..09f1780 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -1,21 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,14 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/UtilsEx.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator; namespace arm_compute { + NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(), - _permute_input(), - _permute_weights(), - _permute_output(), _scaled_output(), _weights_flipped(), - _permuted_input(), - _permuted_weights(), - _permuted_output(), - _is_nchw(false), + _flip_axis(), _original_weights(nullptr), _input(nullptr), _info(), @@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, - DataType::QASYMM8); + DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); const unsigned int width_idx = @@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf weights->dimension(height_idx), info, invalid_right, invalid_bottom); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - if (is_data_type_quantized_asymmetric(input->data_type()) && bias) + if (bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - } - else if (bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } } if (output->tensor_shape().total_size() > 0) @@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), - "Output's dim 0 is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), - "Output's dim 1 is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), - "Output's dim 2 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } unsigned int pad_left = 0; @@ -127,7 +106,6 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf pad_bottom); TensorInfo scale_out_info( input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); - scale_out_info.set_data_layout(input->data_layout()); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const unsigned int batches_idx = @@ -149,19 
+127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom) { + // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); const DataLayout data_layout = input->info()->data_layout(); - - _input = input; - _original_weights = weights; - _info = info; - _is_prepared = false; - _is_nchw = data_layout == DataLayout::NCHW; - - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = @@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); - + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _memory_group.manage(&_scaled_output); - if (!_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_weights); - _memory_group.manage(&_permuted_output); - - // Configure the function to transform the input tensor from NHWC -> NCHW - _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); - _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); - _permuted_input.info()->set_data_layout(DataLayout::NCHW); - - // Configure the function to transform the weights tensor from NHWC -> NCHW - _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); - _permuted_weights.info()->set_data_layout(DataLayout::NCHW); - - // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in - // order to match output shape - - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, - invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); - - TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), - _permuted_input.info()->quantization_info()); - scale_out_info.set_data_layout(DataLayout::NCHW); - _scaled_output.allocator()->init(scale_out_info); - - const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, 
pad_bottom, - DimensionRoundingType::CEIL); - _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); - - _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); - _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); - _flip_weights.configure(&_permuted_weights, &_weights_flipped); - - // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - - const auto out_shape = output->info()->tensor_shape(); - TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; - TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), - output->info()->quantization_info()); - _permuted_output.allocator()->init(permuted_out_info); - _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); - - // Configure the function to transform the convoluted output to NHWC - _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); - - _permuted_input.allocator()->allocate(); - _permuted_weights.allocator()->allocate(); - _permuted_output.allocator()->allocate(); - } - else - { - // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in - // order to match output shape - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); - - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), - input->info()->quantization_info()); - _scaled_output.allocator()->init(scale_out_info); - const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, - DimensionRoundingType::FLOOR); - _upsample_f.configure(input, &_scaled_output, upsample_info); - - _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(weights, &_weights_flipped); - - // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); - } + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + auto 
axis_data = reinterpret_cast(_flip_axis.buffer()); + axis_data[0] = static_cast(width_idx); + axis_data[1] = static_cast(height_idx); + _scaled_output.allocator()->allocate(); } @@ -275,22 +200,10 @@ void NETransposeConvLayer::run() { prepare(); - // MemoryGroupResourceScope scope_mg(_memory_group); - - // Permute input - if (!_is_nchw) - { - _permute_input.run(); - } + MemoryGroupResourceScope scope_mg(_memory_group); _upsample_f.run(); _conv_f.run(); - - // Permute output - if (!_is_nchw) - { - _permute_output.run(); - } } void NETransposeConvLayer::prepare() @@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare() // Run weights flipping and mark original weights tensor as unused _weights_flipped.allocator()->allocate(); - // Permute weights - if (!_is_nchw) - { - _permute_weights.run(); - } - NEScheduler::get().schedule(&_flip_weights, Window::DimZ); + _flip_weights.run(); _original_weights->mark_as_unused(); // Prepare convolution _conv_f.prepare(); - if (!_weights_flipped.is_used()) - { - _weights_flipped.allocator()->free(); - } - _is_prepared = true; } } diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt index 09f6725..609dd45 100644 --- a/compute/cker/CMakeLists.txt +++ b/compute/cker/CMakeLists.txt @@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp) target_link_libraries(nnfw_lib_cker INTERFACE ruy) target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation) target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV) +if(EXPERIMENTAL_RUY_FEATURE) + target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE) +endif(EXPERIMENTAL_RUY_FEATURE) if(PROFILE_RUY) target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler) endif(PROFILE_RUY) diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index 41b1916..1bde640 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -259,6 +259,12 @@ struct FullyConnectedParams // FullyConnectedWeightsFormat weights_format; }; +struct L2NormParams +{ + // uint8 inference params. + int32_t input_zero_point; +}; + struct GatherParams { int32_t axis; @@ -338,6 +344,11 @@ struct SpaceToBatchParams int32_t output_offset; }; +struct SpaceToDepthParams +{ + int32_t block_size; +}; + enum class Order { kColMajor, diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index b69d55c..2abb998 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input) return leading_zeros; } +inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, + int32_t *output_inv_sqrt, int *output_shift) +{ + assert(input >= 0); + if (input <= 1) + { + // Handle the input value 1 separately to avoid overflow in that case + // in the general computation below (b/143972021). Also handle 0 as if it + // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid + // but rare/unrealistic input value. We can expect both to occur in some + // incompletely trained models, but probably not in fully trained models. 
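The helper that starts here (and continues below) produces a fixed-point multiplier plus shift approximating 1/sqrt(x); the quantized L2 normalization added later in this patch uses it to divide by the norm with integer-only arithmetic. As a plain floating-point reference for the same refinement — the Newton-Raphson update x <- x * (3 - v * x * x) / 2, five iterations from x = 1, with v assumed to be pre-scaled roughly into [0.25, 1) as the integer code arranges — a minimal sketch:

```cpp
#include <cmath>
#include <cstdio>

// Floating-point reference for what GetInvSqrtQuantizedMultiplierExp computes
// with integer-only arithmetic: an approximation of 1 / sqrt(v), refined by
// the Newton-Raphson update x <- x * (3 - v * x * x) / 2.
int main()
{
  const double v = 0.64; // assume the input has already been scaled into [0.25, 1)
  double x = 1.0;        // naive starting guess, as in the fixed-point code
  for (int i = 0; i < 5; ++i)
    x = x * (3.0 - v * x * x) / 2.0;
  std::printf("newton = %f, reference = %f\n", x, 1.0 / std::sqrt(v)); // both ~1.25
}
```

The fixed-point version below performs the same iteration with gemmlowp's FixedPoint types and then folds the result into the returned multiplier/shift pair.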
+ *output_inv_sqrt = std::numeric_limits::max(); + *output_shift = 0; + return; + } + assert(input > 1); + *output_shift = 11; + while (input >= (1 << 29)) + { + input /= 4; + ++*output_shift; + } + const unsigned max_left_shift_bits = CountLeadingZeros(static_cast(input)) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + assert(input >= (1 << 27)); + assert(input < (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. + using F3 = FixedPoint; + using F0 = FixedPoint; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) + { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) + { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } + // Convert right shift (right is positive) to left shift. + *output_shift *= reverse_shift; +} + // Comment from tensorflow lite: // // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 9bcf3fd..9b72811 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -78,8 +78,11 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size, output_data, /*result_stride=*/1); - // Apply activation function - ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } } inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, @@ -195,7 +198,11 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape #endif // Apply activation function to floats. - ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } return; } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h new file mode 100644 index 0000000..a0075c3 --- /dev/null +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_L2NORMALIZE_H__ +#define __NNFW_CKER_L2NORMALIZE_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +void L2NormalizeFloat32(const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + float epsilon = 1e-6; + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + for (int i = 0; i < outer_size; ++i) + { + float squared_l2_norm = 0; + for (int c = 0; c < depth; ++c) + { + const float val = input_data[c]; + squared_l2_norm += val * val; + } + float l2_norm = std::sqrt(squared_l2_norm); + l2_norm = std::max(l2_norm, epsilon); + for (int c = 0; c < depth; ++c) + { + *output_data = *input_data / l2_norm; + ++output_data; + ++input_data; + } + } +} + +void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &output_shape, uint8_t *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32_t input_zero_point = params.input_zero_point; + + for (int i = 0; i < outer_size; ++i) + { + int32_t square_l2_norm = 0; + for (int c = 0; c < depth; c++) + { + // Note that input_data advances by depth in the second pass below. 
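The float kernel above divides every element of the innermost dimension by max(sqrt(sum of squares), epsilon); the quantized kernel that continues below does the equivalent with the inverse-sqrt helper from Utils.h. A self-contained restatement of the float arithmetic (not using the cker headers; the input values are made up):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Each element of a row is divided by max(sqrt(sum of squares), epsilon).
int main()
{
  const std::vector<float> row{3.0f, 4.0f};
  const float epsilon = 1e-6f;

  float squared_l2_norm = 0.0f;
  for (float v : row)
    squared_l2_norm += v * v;
  const float l2_norm = std::max(std::sqrt(squared_l2_norm), epsilon);

  for (float v : row)
    std::printf("%f ", v / l2_norm); // 0.600000 0.800000
  std::printf("\n");
}
```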
+ int32_t diff = input_data[c] - input_zero_point; + square_l2_norm += diff * diff; + } + int32_t inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift); + for (int c = 0; c < depth; c++) + { + int32_t diff = *input_data - input_zero_point; + int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32_t unclamped_output_val = 128 + rescaled_diff; + int32_t output_val = std::min(static_cast(255), + std::max(static_cast(0), unclamped_output_val)); + *output_data = static_cast(output_val); + ++input_data; + ++output_data; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_L2NORMALIZE_H__ diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h index 7477858..3d3e59e 100644 --- a/compute/cker/include/cker/operation/Logistic.h +++ b/compute/cker/include/cker/operation/Logistic.h @@ -32,18 +32,9 @@ namespace cker inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { -#ifdef __aarch64__ auto input_map = MapAsVector(input_data, input_shape); auto output_map = MapAsVector(output_data, output_shape); output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op()); -#else - // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2) - const int size = MatchingFlatSize(input_shape, output_shape); - for (int i = 0; i < size; i++) - { - output_data[i] = 1.f / (1.f + std::exp(-input_data[i])); - } -#endif } } // namespace cker diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h index af432f3..4a2732d 100644 --- a/compute/cker/include/cker/operation/Pad.h +++ b/compute/cker/include/cker/operation/Pad.h @@ -26,9 +26,10 @@ namespace nnfw { namespace cker { +template inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, - const float *input_data, const Shape &output_shape, float *output_data, - const float *constant_value_data) + const T *input_data, const Shape &output_shape, T *output_data, + const T *constant_value_data) { // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC` // TODO: come up with more subtle solution that uses subtensors like arm compute @@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu /** List of padding information */ using PaddingList = std::vector; - auto constant_value = constant_value_data ? *constant_value_data : 0; + const T constant_value = constant_value_data ? 
*constant_value_data : 0; assert(output_shape.DimensionsCount() == input_shape.DimensionsCount()); PaddingList padding_list(pad_rank); @@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu { const int32_t in_row_len = input_shape.Dims(0); std::fill_n(output_data, padding_list[0].first, constant_value); - std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float)); + std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T)); std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second, constant_value); break; @@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_offset += padding_list[1].first; // copy a row of input data - memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); out_offset += in_row_len; @@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_offset += padding_list[2].first; // copy a row of input data - memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); out_offset += in_row_len; @@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_c_offset += padding_list[3].first; // copy a row of input data - memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T)); out_c_offset += in_row_len; diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h new file mode 100644 index 0000000..5c82d11 --- /dev/null +++ b/compute/cker/include/cker/operation/Quantize.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_QUANTIZE_H__ +#define __NNFW_CKER_QUANTIZE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include +#include +namespace nnfw +{ +namespace cker +{ +template +inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape, + OutputT *output_data, const float output_scale, const int32_t output_offset) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + int min_val = std::numeric_limits::min(); + int max_val = std::numeric_limits::max(); + + for (int i = 0; i < flat_size; i++) + { + int32_t unclamped = static_cast(round(input_data[i] / output_scale)) + output_offset; + int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_QUANTIZE_H__ diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h new file mode 100644 index 0000000..ef67931 --- /dev/null +++ b/compute/cker/include/cker/operation/SpaceToDepth.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__ +#define __NNFW_CKER_SPACE_TO_DEPTH_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template +inline void SpaceToDepth(const SpaceToDepthParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + const int input_depth = input_shape.Dims(3); + const int batch_size = input_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. 
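The Quantize helper added above applies the usual affine rule q = clamp(round(x / scale) + zero_point, qmin, qmax), taking qmin/qmax from the output type's numeric limits so the same template also covers int8 outputs. A worked uint8 example, independent of the cker headers (scale and zero point are made-up values):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
  const float scale = 0.5f;      // example quantization scale
  const int32_t zero_point = 10; // example zero point
  const float x = 3.2f;

  // round(3.2 / 0.5) = round(6.4) = 6; 6 + 10 = 16, already inside [0, 255].
  const int32_t unclamped = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  const int32_t clamped = std::min<int32_t>(std::max<int32_t>(unclamped, 0), 255);
  std::printf("%d\n", clamped); // 16
}
```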
+ const int stride = params.block_size * input_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); + for (int offset_h = 0; offset_h < params.block_size; ++offset_h) + { + T *dst = output_ptr; + for (int out_w = 0; out_w < output_width; ++out_w) + { + memcpy(dst, input_data, stride * sizeof(T)); + input_data += stride; + dst += output_depth; + } + output_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 432b181..080f66f 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -24,7 +24,7 @@ namespace { -const int kDefaultNumThreadpoolThreads = 4; +const int kDefaultNumThreadpoolThreads = 1; } namespace nnfw diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md index 2bfd14c..657f0f7 100644 --- a/docs/howto/how-to-build-runtime.md +++ b/docs/howto/how-to-build-runtime.md @@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command. ``` $ sudo apt-get install cmake libboost-all-dev -``` +``` If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file. @@ -44,7 +44,7 @@ python3-venv \ scons \ software-properties-common \ unzip \ -wget +wget $ mkdir /tmp/gtest $ cd /tmp/gtest @@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the ``` $ git clone https://github.com/Samsung/ONE.git one $ cd one -$ cp -n Makefile.template Makefile; make install +$ make -f Makefile.template install ``` Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows. diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md index d7e48c8..08d5fd6 100644 --- a/docs/nnfw/howto/CrossBuildForAndroid.md +++ b/docs/nnfw/howto/CrossBuildForAndroid.md @@ -44,11 +44,9 @@ Different from cross build for linux, Here is an example of using Makefile. ```bash -cp -n Makefile.template Makefile - TARGET_OS=android \ CROSS_BUILD=1 \ NDK_DIR=/path/android-tools/r20/ndk \ EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \ -make install +make -f Makefile.template install ``` diff --git a/docs/runtime/core.md b/docs/runtime/core.md index 42ba75f..64a6c62 100644 --- a/docs/runtime/core.md +++ b/docs/runtime/core.md @@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then With generated tensors and kernels, the compiler creates executor objects. There are 3 types of executors are supported - Linear, Dataflow, and Parallel. Linear executor is the default executor and Dataflow Executor and Parallel Executor are experimental. 
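Since the executors are only named in passing here, a purely hypothetical sketch may help picture the split of responsibilities: an executor owns the generated kernels and decides in what order (and with how much parallelism) they run. None of these type names are onert's real interfaces.

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Hypothetical sketch only: a "kernel" is a generated callable and an
// executor decides how the collection of kernels is run.
struct Kernel
{
  std::function<void()> run;
};

struct IExecutor
{
  virtual ~IExecutor() = default;
  virtual void execute() = 0;
};

// Linear executor: run the kernels one after another in topological order.
struct LinearExecutor : IExecutor
{
  std::vector<Kernel> kernels; // assumed already topologically sorted
  void execute() override
  {
    for (auto &k : kernels)
      k.run();
  }
};

int main()
{
  LinearExecutor exec;
  exec.kernels.push_back({[] { std::printf("Conv2D\n"); }});
  exec.kernels.push_back({[] { std::printf("Relu\n"); }});
  exec.execute();
}
```

A dataflow or parallel executor would instead dispatch each kernel as soon as its inputs are ready, possibly across several threads.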
-For more about executors, please refer to [Executors](./executors.md) document. +For more about executors, please refer to [Executors](executors.md) document. ### Module `exec` @@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document. Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation. -Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document. +Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document. diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md index dc39dae..e7a5e27 100644 --- a/docs/runtime/heterogeneous-execution.md +++ b/docs/runtime/heterogeneous-execution.md @@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there ![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png) -Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. +Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. ## Graph Transformation -Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation. +Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](core.md#1-lowering) phase of compilation. Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them. 
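The insertion described above is local: whenever two adjacent operations end up on backends whose data layouts differ, a Permute is spliced onto the connecting edge. A heavily simplified, hypothetical sketch of that rule (these structs are illustrative and do not correspond to onert's PermutationInsertionPass API; the backend names are placeholders):

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Illustrative structs only -- not onert's real IR or pass classes.
enum class Layout { NHWC, NCHW };

struct Op
{
  std::string name;
  std::string backend; // placeholder backend names below
  Layout layout;
};

std::vector<Op> insert_permutes(const std::vector<Op> &sequence)
{
  std::vector<Op> lowered;
  for (std::size_t i = 0; i < sequence.size(); ++i)
  {
    lowered.push_back(sequence[i]);
    const bool has_next = i + 1 < sequence.size();
    if (has_next && sequence[i].layout != sequence[i + 1].layout)
    {
      // Backend boundary with a layout mismatch: copy + layout conversion.
      lowered.push_back(Op{"Permute", "builtin", sequence[i + 1].layout});
    }
  }
  return lowered;
}

int main()
{
  const std::vector<Op> graph{{"Add", "cpu", Layout::NHWC}, {"Conv2D", "gpu", Layout::NCHW}};
  for (const auto &op : insert_permutes(graph))
    std::printf("%s (%s)\n", op.name.c_str(), op.backend.c_str()); // Add, Permute, Conv2D
}
```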
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake index 51a235a..adec1f9 100644 --- a/infra/cmake/packages/ARMComputeSourceConfig.cmake +++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake @@ -8,7 +8,7 @@ function(_ARMComputeSource_import) nnas_include(OptionTools) envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") - set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz) + set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz) ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL}) set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE) diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake index ab0b770..da084e7 100644 --- a/infra/cmake/packages/FlatBuffersConfig.cmake +++ b/infra/cmake/packages/FlatBuffersConfig.cmake @@ -25,7 +25,8 @@ function(_FlatBuffers_build) BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build INSTALL_DIR ${EXT_OVERLAY_DIR} BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS} - IDENTIFIER "1.10-fix1" + IDENTIFIER "1.10-fix2" + EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF" PKG_NAME "FLATBUFFERS") endfunction(_FlatBuffers_build) diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake index e282e0b..19803f1 100644 --- a/infra/cmake/packages/HDF5Config.cmake +++ b/infra/cmake/packages/HDF5Config.cmake @@ -27,6 +27,7 @@ _HDF5_build() find_path(HDF5_CONFIG_DIR "hdf5-config.cmake" PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES + cmake share/cmake share/cmake/hdf5 cmake/hdf5 diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake new file mode 100644 index 0000000..3061779 --- /dev/null +++ b/infra/cmake/packages/Pybind11Config.cmake @@ -0,0 +1,21 @@ +function(_Pybind11_import) + nnas_find_package(Pybind11Source QUIET) + + if(NOT Pybind11Source_FOUND) + set(Pybind11_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT Pybind11Source_FOUND) + + nnas_include(ExternalBuildTools) + ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR} + BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build + INSTALL_DIR ${EXT_OVERLAY_DIR} + IDENTIFIER "2.3.0" + PKG_NAME "PYBIND11") + + find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11) + + set(Pybind11_FOUND TRUE PARENT_SCOPE) +endfunction(_Pybind11_import) + +_Pybind11_import() diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake new file mode 100644 index 0000000..4a9c676 --- /dev/null +++ b/infra/cmake/packages/Pybind11SourceConfig.cmake @@ -0,0 +1,18 @@ +function(_Pybind11Source_import) + if(NOT DOWNLOAD_PYBIND11) + set(Pybind11Source_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT DOWNLOAD_PYBIND11) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.3.0.tar.gz) + + ExternalSource_Download(PYBIND11 ${PYBIND11_URL}) + + set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE) + set(Pybind11Source_FOUND TRUE PARENT_SCOPE) +endfunction(_Pybind11Source_import) + +_Pybind11Source_import() diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile index e675b53..052cc4f 100644 --- a/infra/docker/Dockerfile +++ b/infra/docker/Dockerfile @@ -1,8 +1,6 @@ FROM ubuntu:16.04 ARG UBUNTU_MIRROR -ENV http_proxy $http_proxy -ENV https_proxy $https_proxy RUN if [ -n "$http_proxy" ] ; 
then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi @@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint +RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy # Install google test (source) diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804 index fc6fc9a..cc31bba 100644 --- a/infra/docker/Dockerfile.1804 +++ b/infra/docker/Dockerfile.1804 @@ -1,12 +1,6 @@ FROM ubuntu:18.04 ARG UBUNTU_MIRROR -ENV http_proxy $http_proxy -ENV https_proxy $https_proxy - -RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi -RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi -RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi # Install 'add-apt-repository' RUN apt-get update && apt-get -qqy install software-properties-common @@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint +RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy # Install google test (source) diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt index 3ac6680..0be6885 100644 --- a/infra/nncc/CMakeLists.txt +++ b/infra/nncc/CMakeLists.txt @@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON) option(DOWNLOAD_PYTORCH "Download Pytorch source" ON) option(DOWNLOAD_ONNX "Download ONNX source" ON) option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON) +option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON) option(DOWNLOAD_GTEST "Download Google Test source" ON) option(BUILD_GTEST "Build Google Test from the downloaded source" ON) diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount index d4610e3..d06c5c9 100644 --- a/infra/nncc/command/utcount +++ b/infra/nncc/command/utcount @@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \ oops pepper-assert \ hermes hermes-std \ loco locop locomotiv logo-core logo \ -foder souschef arser \ +foder souschef arser vconone \ safemain mio-circle mio-tflite \ tflite2circle \ luci \ diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt index 8e7f78e..2442a2d 100644 --- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt +++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt @@ -100,7 +100,7 @@ target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV") set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(tensorflow-lite-2.2.0 eigen ${LIB_PTHREAD} dl) -if(${BUILD_WITH_NNAPI}) +if(NOT ANDROID AND ${BUILD_WITH_NNAPI}) target_link_libraries(tensorflow-lite-2.2.0 rt) endif() diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf index 
515cada..bad9eb2 100644 --- a/infra/nnfw/config/gbs.conf +++ b/infra/nnfw/config/gbs.conf @@ -5,7 +5,7 @@ profile = profile.tizen [profile.tizen] user=obs_viewer obs = obs.tizen -repos = repo.tizen_base,repo.tizen_mobile +repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile buildroot = /home/GBS-ROOT/ [obs.tizen] @@ -15,6 +15,8 @@ url = http://api.tizen.org url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/ [repo.tizen_base] -url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ +url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ +[repo.tizen_one] +url = http://nnfw.mooo.com/archive/tizen/ diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630 index e159935..c3ca4b6 100644 --- a/infra/packaging/preset/20200630 +++ b/infra/packaging/preset/20200630 @@ -14,6 +14,7 @@ function preset_configure() REQUIRED_UNITS+=("souschef") REQUIRED_UNITS+=("safemain") REQUIRED_UNITS+=("arser") + REQUIRED_UNITS+=("vconone") # Hermes Logging Framework REQUIRED_UNITS+=("hermes" "hermes-std") # loco IR and related utilities @@ -28,11 +29,14 @@ function preset_configure() REQUIRED_UNITS+=("record-minmax" "circle-quantizer") REQUIRED_UNITS+=("one-cmds") + NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)} + # TODO Use "nncc configure" and "nncc build" cmake \ -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \ -DCMAKE_BUILD_TYPE=release \ -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \ + -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \ ${EXTRA_OPTIONS[@]} \ "${NNAS_PROJECT_PATH}/infra/nncc" } @@ -44,14 +48,4 @@ function preset_install() # Install tf2nnpkg install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg" - - # Create python virtual enviornment - python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv" - - # Install tensorflow - source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate" - python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ - install -U pip setuptools - python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ - install tensorflow-cpu==2.3.0rc0 } diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630 index 9101f82..7846fd3 100644 --- a/infra/packaging/res/tf2nnpkg.20200630 +++ b/infra/packaging/res/tf2nnpkg.20200630 @@ -14,10 +14,16 @@ command_exists() { usage() { echo "Convert TensorFlow model to nnpackage." 
- echo "Usage: tf2nnpkg --info --graphdef [OPTION] -o " - exit 0 + echo "Usage: tf2nnpkg" + echo " --info " + echo " --graphdef " + echo " -o " + echo " --v2 (optional) Use TF 2.x interface" + exit 255 } +TF_INTERFACE="--v1" + # Parse command-line arguments # while [ "$#" -ne 0 ]; do @@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do export OUTPUT_DIR="$2" shift 2 ;; + '--v2') + TF_INTERFACE="--v2" + shift + ;; *) echo "${CUR}" shift @@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':') # generate tflite file -python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \ ---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ ---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \ -python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \ +python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ --input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \ --output_arrays ${OUTPUT} diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh new file mode 100644 index 0000000..22fb335 --- /dev/null +++ b/infra/scripts/build-tcm.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +# STEP 1 +# Download latest TCM tool from +# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar +# +# STEP 2 +# Create symbolic link `./src` for source directory to be analyzed which has `.ahub` configuration. +# +# STEP 3 +# run this `build-tcm.sh` script. +# +# See the following link for additional details. +# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest +# + +echo ${PROJECT_DIR:=${PWD}} + +java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \ + --outdir=$PROJECT_DIR/tcm-output \ + --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \ + --local=$PROJECT_DIR/src \ + --logfile=$PROJECT_DIR/tcm-output/tcm.log \ + --debug diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh index d436e8a..a0323e0 100644 --- a/infra/scripts/compiler_modules.sh +++ b/infra/scripts/compiler_modules.sh @@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex" DEBUG_BUILD_ITEMS+=";oops;pepper-assert" DEBUG_BUILD_ITEMS+=";hermes;hermes-std" DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo" -DEBUG_BUILD_ITEMS+=";foder;souschef;arser" +DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone" DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite" DEBUG_BUILD_ITEMS+=";tflite2circle" DEBUG_BUILD_ITEMS+=";luci" diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh index 7da6736..011d14c 100755 --- a/infra/scripts/docker_build_cross_aarch64_runtime.sh +++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh index f1f666a..551fb57 100755 --- a/infra/scripts/docker_build_cross_arm_runtime.sh +++ b/infra/scripts/docker_build_cross_arm_runtime.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh index ea66f17..876f318 100755 --- a/infra/scripts/docker_build_cross_arm_runtime_release.sh +++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh index 08244e5..f42251b 100755 --- a/infra/scripts/docker_build_cross_coverage.sh +++ b/infra/scripts/docker_build_cross_coverage.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh index 418b50d..5b12531 100755 --- a/infra/scripts/docker_build_nncc.sh +++ b/infra/scripts/docker_build_nncc.sh @@ -54,6 +54,16 @@ pushd $ROOT_PATH > /dev/null mkdir -p ${NNCC_INSTALL_PREFIX} ./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}" +# create python virtual environment +./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv" + +./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ + -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ + install -U pip setuptools +./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ + -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ + install tensorflow-cpu==2.3.0rc0 + mkdir -p ${ARCHIVE_PATH} tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./ diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh index 18809ad..ee0f183 100755 --- a/infra/scripts/docker_build_tizen_cross.sh +++ b/infra/scripts/docker_build_tizen_cross.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh index 556c5bd..55adaa1 100755 --- a/infra/scripts/docker_collect_nnpkg_resources.sh +++ b/infra/scripts/docker_collect_nnpkg_resources.sh @@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null REQUIRED_UNITS=() # Common Libraries REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex") -REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops") +REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone") # Hermes Logging Framework REQUIRED_UNITS+=("hermes" "hermes-std") # loco IR and related utilities diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh index 5521b5f..640a0e0 100755 --- a/infra/scripts/tizen_xu4_test.sh +++ b/infra/scripts/tizen_xu4_test.sh @@ -23,7 +23,7 @@ function install_model() { # download tflite model files pushd $HOST_HOME - tests/scripts/framework/run_test.sh --download=on + tests/scripts/framework/run_test.sh --download=on --run=off # TODO Since this command removes model file(.zip), # We must always download the file unlike model file(.tflite). # Because caching applies only to tflite file. diff --git a/master_diff_1.7.0.patch b/master_diff_1.7.0.patch new file mode 100644 index 0000000..feae398 --- /dev/null +++ b/master_diff_1.7.0.patch @@ -0,0 +1,30424 @@ +diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml +new file mode 100644 +index 0000000..cd34d79 +--- /dev/null ++++ b/.ahub/tcchecker-tca/config.yaml +@@ -0,0 +1,43 @@ ++version: 2 ++test: ++ - name: NN Runtime ++ testCaseLanguage: CPP ++ testFW: GTEST ++ testCaseFolder: ++ - ./compute/test/cker ++ - ./runtime/onert/core/src/backend/cpu_common ++ - ./runtime/onert/frontend/nnapi ++ - ./runtime/onert/test/core/compiler ++ - ./runtime/onert/test/core/exec ++ - ./runtime/onert/test/core/interp ++ - ./runtime/onert/test/graph ++ - ./runtime/onert/test/graph/operand ++ - ./runtime/onert/test/graph/operation ++ - ./runtime/onert/test/graph/verifier ++ - ./runtime/onert/test/ir ++ - ./runtime/onert/test/util ++ - ./tests/nnapi/src ++ - ./tests/nnfw_api/src ++ - ./tests/tools/tflite_run/src ++ ++ testFile: ++ - extension: cpp ++ any: true ++ - extension: cc ++ any: true ++ ++ testCase: ++ - condition: ++ - functionName: ++ starts: ++ - TEST ++ ++ negativeTestCase: ++ - condition: ++ - testName: ++ starts: ++ - neg_ ++ ++ positiveTestCase: ++ - condition: ++ - inverse: negativeTestCase +diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml +new file mode 100644 +index 0000000..ef681de +--- /dev/null ++++ b/compiler/.ahub/tcchecker-tca/config.yaml +@@ -0,0 +1,54 @@ ++version: 2 ++test: ++ - name: NN Compiler ++ testCaseLanguage: CPP ++ testFW: GTEST ++ testCaseFolder: ++ - ./angkor ++ - ./arser ++ - ./circle2circle ++ - ./circle-quantizer ++ - ./cwrap ++ - ./foder ++ - ./hermes ++ - ./hermes-std ++ - ./loco ++ - ./locomotiv ++ - ./locop ++ - ./logo ++ - ./logo-core ++ - ./luci ++ - ./luci-interpreter ++ - ./luci-value-test ++ - ./mio-circle ++ - ./mio-tflite ++ - ./oops ++ - ./pepper-assert ++ - ./pepper-str ++ - ./pepper-strcast ++ - ./pp ++ - ./record-minmax ++ - ./safemain ++ - ./souschef ++ - ./stdex ++ - ./tflite2circle ++ ++ testFile: ++ - extension: .test.cpp ++ any: true ++ ++ testCase: ++ - condition: ++ - functionName: ++ starts: ++ - TEST ++ ++ negativeTestCase: ++ - 
condition: ++ - testName: ++ ends: ++ - _NEG ++ ++ positiveTestCase: ++ - condition: ++ - inverse: negativeTestCase +diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt +new file mode 100644 +index 0000000..ae231bd +--- /dev/null ++++ b/compiler/bcq-tools/CMakeLists.txt +@@ -0,0 +1,27 @@ ++set(BCQ_TOOLS_FILES ++ generate_bcq_output_arrays ++ preserve_bcq_info ++) ++ ++foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES}) ++ ++ set(BCQ_TOOLS_FILE ${BCQ_TOOLS}) ++ set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}") ++ set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}") ++ set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target") ++ ++ add_custom_command(OUTPUT ${BCQ_TOOLS_BIN} ++ COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}" ++ DEPENDS ${BCQ_TOOLS_SRC} ++ COMMENT "Generate ${BCQ_TOOLS_BIN}" ++ ) ++ ++ add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN}) ++ ++ install(FILES ${BCQ_TOOLS_BIN} ++ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE ++ GROUP_READ GROUP_WRITE GROUP_EXECUTE ++ WORLD_READ WORLD_EXECUTE ++ DESTINATION bin) ++ ++endforeach(BCQ_TOOLS) +diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md +new file mode 100644 +index 0000000..18b0f48 +--- /dev/null ++++ b/compiler/bcq-tools/README.md +@@ -0,0 +1,78 @@ ++# BCQ Tools ++ ++This directory includes some tools related with BCQ. ++ ++## preserve_bcq_info ++ ++### Purpose ++ ++`preserve_bcq_info` is for preserving constant nodes which include BCQ information. ++When `.pb` file is converted to `.tflite` file by TFlite converter, constant nodes whose values are exactly same are removed and then linked to only one representative node. ++This makes us impossible to know what constant node should be linked to a node which we want to apply BCQ. ++One of the solutions is making all the same constant nodes different by inserting unique values and ignore the newly generated unique values when BCQ fusing is applied. ++`preserve_bcq_info` will generate and insert unique dummy values to the constant nodes whose values are same not to be removed by Tensorflow Lite converter. ++As a result, BCQ information will be preserved. ++ ++### How to use ++ ++```bash ++preserve_bcq_info \ ++--input_path /path/to/original_model.pb \ ++--output_path /path/to/preserved_model.pb ++``` ++ ++### How it works ++ ++If we add unique dummy value at the end of each constant nodes, all the constant nodes would be different. Following is an example. ++ ++``` ++[Original Constant Nodes] ++const(value=[1, 2, 3], name='const1') ++const(value=[1, 2, 3], name='const2') ++const(value=[1, 2, 3], name='const3') ++ ++[After BCQ information preserved] ++const(value=[1, 2, 3, -1], name='const1') ++const(value=[1, 2, 3, -2], name='const2') ++const(value=[1, 2, 3, -3], name='const3') ++``` ++ ++For dummy values, negative values are used instead of positive values. ++This is because positive valus may be confused with original constant node values. ++For your information, unique dummy value starts from -1 and moves to -2, -3, ..., -N, where N is the number of preserved constant nodes. ++ ++### Caution ++ ++- Newly generated dummy values should be ignored when the constant nodes are used. ++ ++## generate_bcq_output_arrays ++ ++### Purpose ++ ++To apply BCQ, BCQ information nodes should be designated as model output so that they are alive even after TFLite conversion is finished. 
++However, there are so many nodes to designate and sometimes we cannot copy and paste all of them because the string size is too big. ++`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes. ++ ++### How to use ++ ++```bash ++generate_bcq_output_arrays \ ++--input_path /path/to/original_model.pb \ ++--output_path /path/to/output_arrays.txt ++``` ++ ++### How it works ++ ++``` ++[Original BCQ information nodes] ++const(value=[1, 2, 3, -1], name='const1') ++const(value=[1, 2, 3, -2], name='const2') ++const(value=[1, 2, 3, -3], name='const3') ++ ++[Generated output_arrays] ++,const1,const2,const3 ++``` ++ ++### Caution ++ ++- Generated output_arrays will be start with comma. +diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays +new file mode 100644 +index 0000000..48e8a93 +--- /dev/null ++++ b/compiler/bcq-tools/generate_bcq_output_arrays +@@ -0,0 +1,90 @@ ++#!/usr/bin/env python3 ++ ++import tensorflow as tf ++ ++import argparse ++import sys ++ ++ ++def _get_parser(): ++ """ ++ Returns an ArgumentParser for generating output_arrays. ++ """ ++ parser = argparse.ArgumentParser( ++ description=("Command line tool to generated output_arrays of BCQ nodes")) ++ ++ # Input and output path. ++ parser.add_argument( ++ "-i", ++ "--input_path", ++ type=str, ++ help="Full filepath of the input file.", ++ required=True) ++ parser.add_argument( ++ "-o", ++ "--output_path", ++ type=str, ++ help="Full filepath of the output file.", ++ required=True) ++ ++ return parser ++ ++ ++def load_graph(frozen_graph_filename): ++ """ ++ Load graph from frozen pb file ++ """ ++ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: ++ graph_def = tf.compat.v1.GraphDef() ++ graph_def.ParseFromString(f.read()) ++ with tf.Graph().as_default() as graph: ++ tf.import_graph_def(graph_def, name='') ++ return graph ++ ++ ++def dtype2str(dtype): ++ if dtype == "int32": ++ return "TF_INT32" ++ elif dtype == "int64": ++ return "TF_INT64" ++ elif dtype == "float32": ++ return "TF_FLOAT" ++ elif dtype == "bool": ++ return "TF_BOOL" ++ else: ++ raise Exception("Not supported dtype") ++ ++ ++def print_output_arrays(flags): ++ graph_model = load_graph(flags.input_path) ++ graph_model_def = graph_model.as_graph_def() ++ ops = graph_model.get_operations() ++ ++ output_names = [op.outputs[0].name for op in ops ++ if op.type == "Const" and "bcqinfo_" in op.outputs[0].name] ++ ++ output_arrays = "" ++ for output_name in output_names: ++ output_arrays += "," ++ ++ colon_index = output_name.find(":") ++ if colon_index == -1: ++ output_arrays += output_name ++ else: ++ output_arrays += output_name[:colon_index] ++ ++ f = open(flags.output_path, 'w') ++ f.write(output_arrays) ++ f.close() ++ ++ ++def main(): ++ # Parse argument. ++ parser = _get_parser() ++ flags = parser.parse_known_args(args=sys.argv[1:]) ++ ++ print_output_arrays(flags[0]) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info +new file mode 100644 +index 0000000..2ede8d4 +--- /dev/null ++++ b/compiler/bcq-tools/preserve_bcq_info +@@ -0,0 +1,116 @@ ++#!/usr/bin/env python3 ++ ++import tensorflow as tf ++import numpy as np ++ ++import argparse ++import sys ++ ++ ++def _get_parser(): ++ """ ++ Returns an ArgumentParser for preserving BCQ information. ++ """ ++ parser = argparse.ArgumentParser( ++ description=("Command line tool to preserve BCQ information")) ++ ++ # Input and output path. 
++ parser.add_argument( ++ "-i", ++ "--input_path", ++ type=str, ++ help="Full filepath of the input file.", ++ required=True) ++ parser.add_argument( ++ "-o", ++ "--output_path", ++ type=str, ++ help="Full filepath of the output file.", ++ required=True) ++ ++ return parser ++ ++ ++def load_graph(frozen_graph_filename): ++ """ ++ Load graph from frozen pb file ++ """ ++ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: ++ graph_def = tf.compat.v1.GraphDef() ++ graph_def.ParseFromString(f.read()) ++ with tf.Graph().as_default() as graph: ++ tf.import_graph_def(graph_def, name='') ++ return graph ++ ++ ++def preserve_bcq_info(flags): ++ """ ++ Generate unique dummy value from -1 to -N. ++ ++ We use negative values to preserve BCQ information because ++ positive values may cause some confusion with real BCQ information values. ++ """ ++ ++ class UniqueValueGen: ++ def __init__(self): ++ self.unique_value = -1 ++ ++ def gen(self): ++ val = self.unique_value ++ self.unique_value = val - 1 ++ return val ++ ++ unique_value = UniqueValueGen() ++ ++ original_graph_model = load_graph(flags.input_path) ++ original_graph_model_def = original_graph_model.as_graph_def() ++ ++ new_graph = tf.compat.v1.GraphDef() ++ substitution_dict = {} ++ ++ DT_INT32 = None # Just for copying DT_INT32 attribute value ++ ++ for node in original_graph_model_def.node: ++ if node.op == "Const": ++ # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end. ++ # Therefore we should convert the type to INT32 type. ++ if "/bcqinfo_do_w_x" in node.name: ++ original_tensor = tf.make_ndarray(node.attr["value"].tensor) ++ substitution_dict[node.name] = tf.make_tensor_proto( ++ [int(original_tensor[0]), unique_value.gen()], tf.int32) ++ ++ preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters", ++ "/bcqinfo_qbits_of_clusters"] ++ ++ if any(name in node.name for name in preserved_bcqinfo_list): ++ original_tensor = tf.make_ndarray( ++ node.attr["value"].tensor) # variable name change ++ substitution_dict[node.name] = tf.make_tensor_proto( ++ np.append(original_tensor, unique_value.gen()), tf.int32) ++ DT_INT32 = node.attr["dtype"] ++ ++ for node in original_graph_model_def.node: ++ if node.name in substitution_dict: ++ new_node = new_graph.node.add() ++ new_node.op = "Const" ++ new_node.name = node.name ++ new_node.attr["dtype"].CopyFrom(DT_INT32) ++ new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name]) ++ else: ++ new_node = new_graph.node.add() ++ new_node.CopyFrom(node) ++ ++ tf.io.write_graph(new_graph, '.', flags.output_path, False) ++ ++ ++def main(): ++ # Parse argument. ++ parser = _get_parser() ++ flags = parser.parse_known_args(args=sys.argv[1:]) ++ ++ # Generate a new pb file, which BCQ information is preserved. 
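The substitution above relies on a simple observation: appending a unique negative sentinel makes otherwise identical constant payloads distinct, so the converter can no longer fold them into a single node. The same idea in standalone form (illustrative only; the real tool edits GraphDef tensor protos):

```cpp
#include <cstdio>
#include <set>
#include <vector>

int main()
{
  // Three constants with identical payloads, as in the README example above.
  std::vector<std::vector<int>> constants{{1, 2, 3}, {1, 2, 3}, {1, 2, 3}};

  int next_sentinel = -1;
  for (auto &c : constants)
    c.push_back(next_sentinel--); // {1,2,3,-1}, {1,2,3,-2}, {1,2,3,-3}

  std::set<std::vector<int>> unique(constants.begin(), constants.end());
  std::printf("%zu distinct constants\n", unique.size()); // prints 3
}
```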
++ preserve_bcq_info(flags[0]) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt +index 1335057..009bfab 100644 +--- a/compiler/circle-quantizer/CMakeLists.txt ++++ b/compiler/circle-quantizer/CMakeLists.txt +@@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service) + target_link_libraries(circle-quantizer luci_pass) + target_link_libraries(circle-quantizer luci_export) + target_link_libraries(circle-quantizer arser) ++target_link_libraries(circle-quantizer vconone) + + install(TARGETS circle-quantizer DESTINATION bin) +diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake +index 2293e53..c21e28e 100644 +--- a/compiler/circle-quantizer/requires.cmake ++++ b/compiler/circle-quantizer/requires.cmake +@@ -5,3 +5,4 @@ require("safemain") + require("luci") + require("oops") + require("arser") ++require("vconone") +diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp +index b56b547..8d3a80c 100644 +--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp ++++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp +@@ -25,6 +25,7 @@ + + #include + #include ++#include + + #include + #include +@@ -36,6 +37,12 @@ using OptionHook = std::function; + using Algorithms = luci::CircleOptimizer::Options::Algorithm; + using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters; + ++void print_version(void) ++{ ++ std::cout << "circle-quantizer version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} ++ + int entry(int argc, char **argv) + { + // Simple argument parser (based on map) +@@ -49,13 +56,20 @@ int entry(int argc, char **argv) + + arser::Arser arser("circle-quantizer provides circle model quantization"); + ++ arser.add_argument("--version") ++ .nargs(0) ++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument(qdqw) + .nargs(3) + .type(arser::DataType::STR_VEC) + .required(false) + .help("Quantize-dequantize weight values required action before quantization. " + "Three arguments required: input_dtype(float32) " +- "output_dtype(uint8) granularity(layer)"); ++ "output_dtype(uint8) granularity(layer, channel)"); + + arser.add_argument(qwmm) + .nargs(3) +@@ -63,7 +77,7 @@ int entry(int argc, char **argv) + .required(false) + .help("Quantize with min/max values. 
" + "Three arguments required: input_dtype(float32) " +- "output_dtype(uint8) granularity(layer)"); ++ "output_dtype(uint8) granularity(layer, channel)"); + + arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model"); + arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model"); +diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp +index a55cd45..38e3073 100644 +--- a/compiler/circle-tensordump/driver/Driver.cpp ++++ b/compiler/circle-tensordump/driver/Driver.cpp +@@ -46,7 +46,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + std::unique_ptr dump; +diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp +index dfa78f0..a8d3256 100644 +--- a/compiler/circle-tensordump/src/Dump.cpp ++++ b/compiler/circle-tensordump/src/Dump.cpp +@@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s + auto max = quant_param->max(); + auto scale = quant_param->scale(); + auto zero_point = quant_param->zero_point(); ++ auto quantized_dimension = quant_param->quantized_dimension(); + + os << " " + print_format2 + "   ├── min : "; + ::print_comma_sepearted(os, min); +@@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s + os << " " + print_format2 + "   ├── scale : "; + ::print_comma_sepearted(os, scale); + os << std::endl; +- os << " " + print_format2 + "   └── zero_point : "; ++ os << " " + print_format2 + "   ├── zero_point : "; + ::print_comma_sepearted(os, zero_point); + os << std::endl; ++ os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension; ++ os << std::endl; + } + + // buffer +@@ -229,7 +232,7 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, + } + + /** +- * This function writes data to given hdf5 file like below. ++ * This function writes vector data to given hdf5 file like below. 
+ * + * GROUP "group_name" + * ㄴDATATYPE "type" +@@ -238,9 +241,9 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, + * ㄴDATA "data" + */ + template +-void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, +- const H5::PredType &type, const flatbuffers::Vector *data, +- std::vector dims) ++void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, ++ const H5::PredType &type, const flatbuffers::Vector *data, ++ std::vector dims) + { + if (data == nullptr) + return; +@@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d + dataset->write(data->data(), type); + } + ++/// @brief This function writes scalar data to given hdf5 file ++template ++void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, ++ const H5::PredType &type, T data) ++{ ++ auto dataspace = std::make_unique(H5S_SCALAR); ++ auto dataset = std::make_unique( ++ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace)); ++ dataset->write(&data, type); ++} ++ + } // namespace + + namespace circletensordump +@@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, + auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data(); + if (buff_data_ptr) + { +- ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), +- buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); ++ ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), ++ buff_data_ptr, ++ ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); + } + + // write quantization parameters +@@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, + if (quant_param) + { + auto min = quant_param->min(); +- ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, +- ::hdf5_dims_cast(min)); ++ ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, ++ ::hdf5_dims_cast(min)); + auto max = quant_param->max(); +- ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, +- ::hdf5_dims_cast(max)); ++ ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, ++ ::hdf5_dims_cast(max)); + auto scale = quant_param->scale(); +- ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, +- ::hdf5_dims_cast(scale)); ++ ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, ++ ::hdf5_dims_cast(scale)); + auto zero_point = quant_param->zero_point(); +- ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point, +- ::hdf5_dims_cast(zero_point)); ++ ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, ++ zero_point, ::hdf5_dims_cast(zero_point)); ++ auto quantized_dimension = quant_param->quantized_dimension(); ++ ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension", ++ H5::PredType::NATIVE_INT32, quantized_dimension); + } + } + } +diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp +index 1af31d9..7a44c65 100644 +--- a/compiler/circle-verify/src/Driver.cpp ++++ b/compiler/circle-verify/src/Driver.cpp +@@ -35,7 +35,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + auto verifier = std::make_unique(); 
+diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt +index 6663cb9..4bcaae3 100644 +--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt ++++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt +@@ -1,25 +1,12 @@ + nnas_include(TargetRequire) + + unset(REQUIRED_TARGETS) +-list(APPEND REQUIRED_TARGETS circlechef) + list(APPEND REQUIRED_TARGETS circle-inspect) + list(APPEND REQUIRED_TARGETS circle-verify) + list(APPEND REQUIRED_TARGETS circle2circle) + list(APPEND REQUIRED_TARGETS dredd_rule_lib) +-list(APPEND REQUIRED_TARGETS tflchef) +-list(APPEND REQUIRED_TARGETS tflite2circle) + TargetRequire_Return(${REQUIRED_TARGETS}) + +-nncc_find_resource(TensorFlowLiteRecipes) +-nncc_find_resource(CircleRecipes) +- +-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}") +-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}") +-unset(RECIPE_REPO) +- +-set(TEST_RECIPE_FILENAME "test.recipe") +-set(TEST_RULE_FILENAME "test.rule") +- + unset(TEST_DEPS) + unset(TEST_NAMES) + +@@ -27,21 +14,9 @@ set(options "") + set(oneValueArgs "") + set(multiValueArgs PASS) + +-macro(Add RECIPE) +- if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe") +- if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe") +- message(FATAL_ERROR "Missing recipe of '${RECIPE}' test") +- else() +- set(RECIPE_REPO ${CIRCLE_RECIPE_REPO}) +- endif() +- else() +- set(RECIPE_REPO ${TFLITE_RECIPE_REPO}) +- endif() +- +- if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule") +- message(FATAL_ERROR "Missing rule of '${RECIPE}' test") +- endif() ++get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR) + ++macro(Add RECIPE) + cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + unset(OPT_OPTIONS) + foreach(src ${ARG_PASS}) +@@ -49,71 +24,20 @@ macro(Add RECIPE) + list(APPEND OPT_OPTIONS "--${src}") + endforeach(src ${ARG_PASS}) + +- set(RECIPE_FILE "${RECIPE}.recipe") +- set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}") +- set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}") +- +- set(RULE_FILE "${RECIPE}.rule") +- set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}") +- set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}") +- +- set(TFLITE_FILE "${RECIPE}.tflite") +- set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}") +- + set(CIRCLE_FILE "${RECIPE}.circle") +- set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}") ++ set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}") + + set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle") + set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}") + +- # Copy .recipe +- add_custom_command(OUTPUT ${RECIPE_BINARY_PATH} +- COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}" +- DEPENDS ${RECIPE_SOURCE_PATH} +- COMMENT "Generate ${RECIPE_FILE}" +- ) +- +- # Copy .rule +- add_custom_command(OUTPUT ${RULE_BINARY_PATH} +- COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}" +- DEPENDS ${RULE_SOURCE_PATH} +- COMMENT "Generate ${RULE_FILE}" +- ) +- +- if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO}) +- # Generate .tflite +- add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH} +- COMMAND $ ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH} +- DEPENDS $ ${RECIPE_BINARY_PATH} +- COMMENT "Generate ${TFLITE_FILE}" +- ) +- +- # Generate .circle +- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH} +- COMMAND $ ${TFLITE_OUTPUT_PATH} 
${CIRCLE_OUTPUT_PATH} +- DEPENDS $ ${TFLITE_OUTPUT_PATH} +- COMMENT "Generate ${CIRCLE_FILE}" +- ) +- +- list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH}) +- else() +- # Generate .circle +- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH} +- COMMAND $ ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH} +- DEPENDS $ ${RECIPE_BINARY_PATH} +- COMMENT "Generate ${CIRCLE_FILE}" +- ) +- endif() +- + # Generate optimized .circle + add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH} +- COMMAND $ ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH} +- DEPENDS $ ${CIRCLE_OUTPUT_PATH} ++ COMMAND $ ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH} ++ DEPENDS $ ${CIRCLE_PATH} + COMMENT "Generate ${OPT_CIRCLE_FILE}" + ) + +- list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH} +- ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}) ++ list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH}) + list(APPEND TEST_NAMES ${RECIPE}) + endmacro(Add) + +@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}") + + # Generate dependencies + add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS}) ++add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps) ++ ++get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR) + + # Run tests + add_test( + NAME circle2circle_dredd_recipe_test + COMMAND "${TEST_RUNNER}" + "${TEST_CONFIG}" +- "${CMAKE_CURRENT_BINARY_DIR}" ++ "${ARTIFACTS_BIN_PATH}" + ${TEST_NAMES} + ) +diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake +index e4a5b71..70e7c52 100644 +--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake ++++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake +@@ -1,7 +1,5 @@ +-require("circlechef") + require("circle2circle") + require("circle-inspect") + require("circle-verify") ++require("common-artifacts") + require("dredd-rule-lib") +-require("tflchef") +-require("tflite2circle") +diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst +index 202f669..6328a64 100644 +--- a/compiler/circle2circle-dredd-recipe-test/test.lst ++++ b/compiler/circle2circle-dredd-recipe-test/test.lst +@@ -11,9 +11,10 @@ + ## TFLITE RECIPE + + Add(Net_InstanceNorm_001 PASS fuse_instnorm) +-# Add(Net_InstanceNorm_002 PASS fuse_instnorm) ++Add(Net_InstanceNorm_002 PASS fuse_instnorm) + Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul) + Add(MatMul_000 PASS resolve_customop_matmul) ++Add(DepthwiseConv2D_003 PASS) + + ## CIRCLE RECIPE + +diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh +index 33a2036..2899587 100755 +--- a/compiler/circle2circle-dredd-recipe-test/testall.sh ++++ b/compiler/circle2circle-dredd-recipe-test/testall.sh +@@ -13,21 +13,22 @@ if [[ $# -lt 2 ]]; then + exit 255 + fi + ++WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + CONFIG_PATH="$1"; shift +-WORKDIR="$1"; shift ++RESOURCE_DIR="$1"; shift + + source "${CONFIG_PATH}" + + echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}" + echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}" + echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}" +-echo "-- Found workdir: ${WORKDIR}" ++echo "-- Found common-artifacts: ${RESOURCE_DIR}" + + TESTED=() + PASSED=() + FAILED=() + +-pushd "${WORKDIR}" ++pushd ${WORKDIR} + while [[ $# -ne 0 ]]; do + PREFIX="$1"; shift + +@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do + cat > "${PREFIX}.log" <( + exec 2>&1 + 
+- echo "-- Found tflite: ${PREFIX}.tflite" ++ echo "-- Found circle: ${PREFIX}.opt.circle" + + # Exit immediately if any command fails + set -e +@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do + set +x + + # (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh +- COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle" ++ COMPILED_FILE="${PREFIX}.opt.circle" + INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH} + VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH} + ERROR_LOG="${PREFIX}.error" +@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do + trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR + + source rule-lib.sh +- source "${PREFIX}.rule" ++ source "${RESOURCE_DIR}/${PREFIX}.rule" + + # unset + trap - ERR +diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt +index 7b2bf9b..f60c896 100644 +--- a/compiler/circle2circle/CMakeLists.txt ++++ b/compiler/circle2circle/CMakeLists.txt +@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service) + target_link_libraries(circle2circle luci_pass) + target_link_libraries(circle2circle luci_export) + target_link_libraries(circle2circle arser) ++target_link_libraries(circle2circle vconone) + + install(TARGETS circle2circle DESTINATION bin) + +@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service) + target_link_libraries(circle2circle_test luci_pass) + target_link_libraries(circle2circle_test luci_export) + target_link_libraries(circle2circle_test arser) ++target_link_libraries(circle2circle_test vconone) +diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake +index 8cbb90d..36a9efd 100644 +--- a/compiler/circle2circle/requires.cmake ++++ b/compiler/circle2circle/requires.cmake +@@ -9,3 +9,4 @@ require("hermes") + require("hermes-std") + require("luci") + require("arser") ++require("vconone") +diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp +index 6888d26..849597b 100644 +--- a/compiler/circle2circle/src/Circle2Circle.cpp ++++ b/compiler/circle2circle/src/Circle2Circle.cpp +@@ -26,6 +26,7 @@ + + #include + #include ++#include + + #include + #include +@@ -34,6 +35,12 @@ + using Algorithms = luci::CircleOptimizer::Options::Algorithm; + using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters; + ++void print_version(void) ++{ ++ std::cout << "circle2circle version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} ++ + int entry(int argc, char **argv) + { + // Simple argument parser (based on map) +@@ -44,6 +51,13 @@ int entry(int argc, char **argv) + + arser::Arser arser("circle2circle provides circle model optimization and transformations"); + ++ arser.add_argument("--version") ++ .nargs(0) ++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument("--all").nargs(0).required(false).default_value(false).help( + "Enable all optimize options"); + +diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt +index cba7d0a..3e2ddcb 100644 +--- a/compiler/circlechef/CMakeLists.txt ++++ b/compiler/circlechef/CMakeLists.txt +@@ -18,4 +18,6 @@ add_subdirectory(core) + add_subdirectory(circle) + # Tools + add_subdirectory(tools) +-add_subdirectory(tests) ++if(ENABLE_TEST) ++ add_subdirectory(tests) ++endif(ENABLE_TEST) +diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp 
+index 17ef1be..51326c7 100644 +--- a/compiler/circlechef/circle/src/RecipeChef.cpp ++++ b/compiler/circlechef/circle/src/RecipeChef.cpp +@@ -181,6 +181,8 @@ std::unique_ptr generate_recipe(const circle::Model *model) + for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx) + chef_quant->add_zero_point(quant->zero_point()->Get(idx)); + } ++ circlechef::TensorQuantization *chef_quant = operand->mutable_quant(); ++ chef_quant->set_quantized_dimension(quant->quantized_dimension()); + } + } + +diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp +index 76aeacd..d81467d 100644 +--- a/compiler/circlechef/core/src/ModelChef.cpp ++++ b/compiler/circlechef/core/src/ModelChef.cpp +@@ -413,6 +413,7 @@ template void cook_graph(const T &graph, CookParams &cp) + quant_builder.add_min(quant_min); + quant_builder.add_scale(quant_scale); + quant_builder.add_zero_point(quant_zero_point); ++ quant_builder.add_quantized_dimension(quant.quantized_dimension()); + + // Update QuantizationParameters Index + quant_index = quant_builder.Finish(); +diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto +index b8c009b..3e5e6b1 100644 +--- a/compiler/circlechef/proto/circlechef.proto ++++ b/compiler/circlechef/proto/circlechef.proto +@@ -35,6 +35,7 @@ message TensorQuantization { + repeated float max = 2; + repeated float scale = 3; + repeated int64 zero_point = 4; ++ optional int32 quantized_dimension = 5 [default = 0]; + } + + message Operand { +diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp +index a15da40..bcc0c7a 100644 +--- a/compiler/circlechef/tools/file/Driver.cpp ++++ b/compiler/circlechef/tools/file/Driver.cpp +@@ -41,7 +41,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + int32_t model_version = 1; +diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp +index 9c0b9ea..8a2b85f 100644 +--- a/compiler/circlechef/tools/reverse/Driver.cpp ++++ b/compiler/circlechef/tools/reverse/Driver.cpp +@@ -38,7 +38,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string circle_path = arser.get("circle"); +diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp +index b8f561f..657f24f 100644 +--- a/compiler/circledump/driver/Driver.cpp ++++ b/compiler/circledump/driver/Driver.cpp +@@ -33,7 +33,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << '\n'; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string circle_path = arser.get("circle"); +diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp +index 2c03203..5aa5d51 100644 +--- a/compiler/circledump/src/OpPrinter.cpp ++++ b/compiler/circledump/src/OpPrinter.cpp +@@ -593,6 +593,20 @@ public: + } + }; + ++class UniquePrinter : public OpPrinter ++{ ++public: ++ void options(const circle::Operator *op, std::ostream &os) const override ++ { ++ if (auto *params = op->builtin_options_as_UniqueOptions()) ++ { ++ os << " "; ++ os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") "; ++ os << std::endl; ++ } ++ } ++}; ++ + class WhilePrinter : public OpPrinter + { + public: +@@ -744,6 +758,7 @@ OpPrinterRegistry::OpPrinterRegistry() + _op_map[circle::BuiltinOperator_SUM] = 
make_unique(); + _op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique(); + // There is no Option for TOPK_V2 ++ _op_map[circle::BuiltinOperator_UNIQUE] = make_unique(); + _op_map[circle::BuiltinOperator_WHILE] = make_unique(); + _op_map[circle::BuiltinOperator_CUSTOM] = make_unique(); + +diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst +index b614b71..d3f5601 100644 +--- a/compiler/common-artifacts/exclude.lst ++++ b/compiler/common-artifacts/exclude.lst +@@ -5,9 +5,12 @@ + + #[[ optimize : Exclude from circle optimization(circle2circle) ]] + ## TensorFlowLiteRecipes +-optimize(ReLU6_000) +-optimize(Where_000) +-optimize(Where_001) ++optimize(Unique_000) ++optimize(Unique_001) ++optimize(Unique_002) ++optimize(Unique_003) ++optimize(Unique_U8_000) ++optimize(Unique_U8_001) + + ## CircleRecipes + +@@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000) + tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation + tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation + tcgenerate(DepthwiseConv2D_U8_000) ++tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet + tcgenerate(Div_000) + tcgenerate(ELU_000) + tcgenerate(Equal_000) +@@ -96,7 +100,7 @@ tcgenerate(Neg_000) + tcgenerate(Net_Dangle_001) + tcgenerate(Net_InstanceNorm_001) + tcgenerate(Net_InstanceNorm_002) +-tcgenerate(Net_ZeroDim_001) # fix luci ++tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim + tcgenerate(NotEqual_000) + tcgenerate(OneHot_000) + tcgenerate(OneHot_001) +@@ -120,9 +124,9 @@ tcgenerate(ReduceProd_001) + tcgenerate(ReduceProd_002) + tcgenerate(ReduceProd_003) + tcgenerate(ReLU_000) +-tcgenerate(ReLU6_000) # luci NYI ++tcgenerate(ReLU6_000) + tcgenerate(ReLUN1To1_000) +-tcgenerate(Reshape_003) # fix luci ++tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option + tcgenerate(Reshape_U8_000) + tcgenerate(ResizeBilinear_000) + tcgenerate(ResizeNearestNeighbor_000) +@@ -148,7 +152,7 @@ tcgenerate(SpaceToBatchND_002) + tcgenerate(SpaceToBatchND_003) + tcgenerate(SpaceToDepth_000) + tcgenerate(SparseToDense_000) +-tcgenerate(SplitV_000) # fix luci ++tcgenerate(SplitV_000) + tcgenerate(Sqrt_000) + tcgenerate(Square_000) + tcgenerate(SquaredDifference_000) +@@ -164,22 +168,21 @@ tcgenerate(Sum_001) + tcgenerate(Tanh_000) + tcgenerate(Tile_000) + tcgenerate(Tile_U8_000) +-tcgenerate(TopKV2_000) # fix luci +-tcgenerate(TopKV2_001) # fix luci +-tcgenerate(TransposeConv_000) # fix interpreter ++tcgenerate(TopKV2_000) ++tcgenerate(TopKV2_001) + tcgenerate(Unique_000) + tcgenerate(Unique_001) + tcgenerate(Unique_002) + tcgenerate(Unique_003) + tcgenerate(Unique_U8_000) + tcgenerate(Unique_U8_001) +-tcgenerate(Where_000) # luci NYI +-tcgenerate(Where_001) # luci NYI +-tcgenerate(While_000) # fix luci ++tcgenerate(Where_000) ++tcgenerate(Where_001) ++tcgenerate(While_000) + tcgenerate(While_001) + tcgenerate(While_002) + tcgenerate(While_003) +-tcgenerate(YUV_TO_RGB_000) # fix luci ++tcgenerate(YUV_TO_RGB_000) + tcgenerate(YUV_TO_RGB_U8_000) + tcgenerate(ZerosLike_000) + +diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp +index 2cbc093..ea7ef65 100644 +--- a/compiler/hermes/src/hermes.test.cpp ++++ b/compiler/hermes/src/hermes.test.cpp +@@ -18,7 +18,28 @@ + + #include + +-TEST(HermesTest, simple_usecase) ++namespace + { +- // TO BE FILLED ++ ++class Logger final : public hermes::Source ++{ ++public: ++ Logger(hermes::Context *ctx); ++ 
~Logger(); ++}; ++ ++Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); } ++Logger::~Logger() { deactivate(); } ++ ++} // namespace ++ ++TEST(HermesTest, logger_constructor_NEG) ++{ ++ hermes::Context context; ++ // we expect segmentfault from nullptr->sources() ++ ASSERT_DEATH(Logger logger(&context), ""); ++ ++ SUCCEED(); + } ++ ++// TODO add HermesTest simple_usecase +diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp +index cdb255c..4680f5c 100644 +--- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp ++++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp +@@ -90,6 +90,16 @@ template void test() + } + } // namespace + +-TEST(NodeExecution_BiasEncode, s32) { test(); } ++TEST(NodeExecution_BiasEncode, s32) ++{ ++ test(); ++ ++ SUCCEED(); ++} + +-TEST(NodeExecution_BiasEncode, f32) { test(); } ++TEST(NodeExecution_BiasEncode, f32) ++{ ++ test(); ++ ++ SUCCEED(); ++} +diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp +index f1f3a52..7d942e1 100644 +--- a/compiler/locomotiv/src/Node/MatMul.test.cpp ++++ b/compiler/locomotiv/src/Node/MatMul.test.cpp +@@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3) + }; + + run_test(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32); ++ ++ SUCCEED(); + } + + /* from the code below: +@@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6) + }; + + run_test(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32); ++ ++ SUCCEED(); + } + + // clang-format on +diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp +index c9808d3..aff9ebe 100644 +--- a/compiler/locop/src/FormattedGraph.test.cpp ++++ b/compiler/locop/src/FormattedGraph.test.cpp +@@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple) + + // TODO Validate the output (when the implementation becomes stable) + std::cout << locop::fmt(g) << std::endl; ++ ++ SUCCEED(); + } + + TEST(LinearV1FormatterTest, user_defined_node_summary_builder) +diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp +index 0f0017a..fc85df3 100644 +--- a/compiler/locop/src/FormattedTensorShape.test.cpp ++++ b/compiler/locop/src/FormattedTensorShape.test.cpp +@@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat) + tensor_shape->dim(0) = 4; + + std::cout << fmt(tensor_shape.get()) << std::endl; ++ ++ SUCCEED(); + } +diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h +index 9987898..4ac3d86 100644 +--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h ++++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h +@@ -79,12 +79,11 @@ private: + // + // Note that due to historical and performance reasons, per-tensor quantization uses unsigned + // integer types, while per-channel uses signed types assuming 'zero_point' == 0. +-// +-// TODO Add 'quantized_dimension' field for per-channel case when IR provides it. 
+ struct AffineQuantization + { + std::vector scale; + std::vector zero_point; ++ int32_t quantized_dimension; + }; + + class Tensor +@@ -108,6 +107,12 @@ public: + return _quantization.zero_point[0]; + } + ++ const std::vector &scales() const { return _quantization.scale; } ++ ++ const std::vector &zero_points() const { return _quantization.zero_point; } ++ ++ int32_t quantized_dimension() const { return _quantization.quantized_dimension; } ++ + template const T *data() const { return reinterpret_cast(_data.get()); } + + template T *data() { return reinterpret_cast(_data.get()); } +diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h +index a32e0d4..65d1197 100644 +--- a/compiler/luci-interpreter/src/core/KernelParams.h ++++ b/compiler/luci-interpreter/src/core/KernelParams.h +@@ -56,6 +56,11 @@ struct Conv2DParams + Activation activation; + }; + ++struct DepthToSpaceParams ++{ ++ int block_size; ++}; ++ + struct DepthwiseConv2DParams + { + Padding padding; +diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt +index fe36231..a1fd1de 100644 +--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt ++++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt +@@ -12,6 +12,8 @@ set(SOURCES + Concatenation.cpp + Conv2D.h + Conv2D.cpp ++ DepthToSpace.h ++ DepthToSpace.cpp + DepthwiseConv2D.h + DepthwiseConv2D.cpp + Elu.h +@@ -40,6 +42,10 @@ set(SOURCES + Pad.cpp + Reshape.h + Reshape.cpp ++ Reverse.h ++ Reverse.cpp ++ Slice.h ++ Slice.cpp + Softmax.h + Softmax.cpp + SpaceToDepth.h +@@ -77,6 +83,7 @@ set(TEST_SOURCES + AveragePool2D.test.cpp + Concatenation.test.cpp + Conv2D.test.cpp ++ DepthToSpace.test.cpp + DepthwiseConv2D.test.cpp + Elu.test.cpp + FullyConnected.test.cpp +@@ -91,6 +98,8 @@ set(TEST_SOURCES + Mul.test.cpp + Pad.test.cpp + Reshape.test.cpp ++ Reverse.test.cpp ++ Slice.test.cpp + Softmax.test.cpp + SpaceToDepth.test.cpp + Split.test.cpp +diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp +new file mode 100644 +index 0000000..cab63e2 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp +@@ -0,0 +1,90 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "DepthToSpace.h" ++#include "Utils.h" ++#include ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms) ++ : KernelWithParams({input}, {output}, params) ++{ ++} ++ ++void DepthToSpace::configure() ++{ ++ if (input()->shape().num_dims() != 4) ++ { ++ throw std::runtime_error("Invalid input num_dims."); ++ } ++ if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 && ++ output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 && ++ output()->element_type() != DataType::S64) ++ { ++ throw std::runtime_error("Invalid output type"); ++ } ++ if (input()->element_type() != output()->element_type()) ++ { ++ throw std::runtime_error("Type mismatch on input and output."); ++ } ++ const int block_size = params().block_size; ++ const int32_t input_height = input()->shape().dim(1); ++ const int32_t input_width = input()->shape().dim(2); ++ const int32_t input_channels = input()->shape().dim(3); ++ int32_t output_height = input_height * block_size; ++ int32_t output_width = input_width * block_size; ++ int32_t output_channels = input_channels / block_size / block_size; ++ ++ assert(input_height == output_height / block_size); ++ assert(input_width == output_width / block_size); ++ assert(input_channels == output_channels * block_size * block_size); ++ ++ Shape output_shape(4); ++ output_shape.dim(0) = input()->shape().dim(0); ++ output_shape.dim(1) = output_height; ++ output_shape.dim(2) = output_width; ++ output_shape.dim(3) = output_channels; ++ ++ output()->resize(output_shape); ++} ++ ++void DepthToSpace::execute() const ++{ ++ tflite::DepthToSpaceParams op_params; ++ op_params.block_size = params().block_size; ++ switch (input()->element_type()) ++ { ++ case DataType::FLOAT32: ++ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ case DataType::U8: ++ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ default: ++ throw std::runtime_error("Unsupported Type."); ++ } ++} ++ ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h +new file mode 100644 +index 0000000..63ce376 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h +@@ -0,0 +1,45 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H ++#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H ++ ++#include "core/Kernel.h" ++#include "core/KernelParams.h" ++ ++#include ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++class DepthToSpace : public KernelWithParams ++{ ++public: ++ DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms); ++ ++ const Tensor *input() const { return _inputs[0]; } ++ Tensor *output() const { return _outputs[0]; } ++ ++ void configure() override; ++ void execute() const override; ++}; ++ ++} // namespace kernels ++} // namespace luci_interpreter ++ ++#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H +diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp +new file mode 100644 +index 0000000..1b80570 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp +@@ -0,0 +1,60 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "kernels/DepthToSpace.h" ++#include "kernels/TestUtils.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++template class DepthToSpaceTest : public ::testing::Test ++{ ++}; ++ ++using DataTypes = ::testing::Types; ++TYPED_TEST_CASE(DepthToSpaceTest, DataTypes); ++ ++TYPED_TEST(DepthToSpaceTest, SimpleCase) ++{ ++ std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8}; ++ Shape input_shape{1, 1, 2, 4}; ++ std::vector output_data{1, 2, 5, 6, 3, 4, 7, 8}; ++ std::vector output_shape{1, 2, 4, 1}; ++ ++ Tensor input_tensor = makeInputTensor()>(input_shape, input_data); ++ Tensor output_tensor = makeOutputTensor(getElementType()); ++ ++ DepthToSpaceParams params{}; ++ params.block_size = 2; ++ ++ DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params); ++ kernel.configure(); ++ kernel.execute(); ++ ++ EXPECT_THAT(extractTensorData(output_tensor), ++ ::testing::ElementsAreArray(output_data)); ++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); ++} ++ ++} // namespace ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +index fad450d..f53eaca 100644 +--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +@@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float) + ElementsAreArray(ArrayFloatNear(ref_output_data))); + } + +-TEST(L2NormalizeTest, Uint8Quantized) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case +-} ++// TODO Uint8Quantized ++// Implement GetDequantizedOutput Function. 
++// Create Test for Uint8 Case + + } // namespace + } // namespace kernels +diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +index b0c06e7..c79d3d6 100644 +--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +@@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple) + 1.0f, -0.5f, -1.0f, // Row 2 + }, + /*alpha=*/0.5f, getElementType()); +-} + +-TEST(LeakReluTest, Uint8Simple) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case ++ SUCCEED(); + } + ++// TODO Uint8Simple ++// Implement GetDequantizedOutput Function. ++// Create Test for Uint8 Case ++ + } // namespace + } // namespace kernels + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +index 17456a4..00feddf 100644 +--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +@@ -49,10 +49,8 @@ TEST(LogisticTest, Float) + // TODO make a Shape checking of output_tensor. + } + +-TEST(LogisticTest, Uint8) +-{ +- // Need to Implement GetDequantizedOutput Function. +-} ++// TODO Uint8 ++// Need to Implement GetDequantizedOutput Function. + + } // namespace + } // namespace kernels +diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp +new file mode 100644 +index 0000000..a463084 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "kernels/Reverse.h" ++#include "kernels/Utils.h" ++#include ++ ++namespace luci_interpreter ++{ ++ ++namespace kernels ++{ ++ ++Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output) ++ : Kernel({input, axes}, {output}) ++{ ++} ++ ++void Reverse::configure() ++{ ++ assert(axes()->shape().num_dims() == 1); ++ assert(input()->shape().num_dims() >= axes()->shape().num_elements()); ++ if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 && ++ input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 && ++ input()->element_type() != DataType::S64) ++ { ++ throw std::runtime_error("Unsupported input type."); ++ } ++ if (axes()->element_type() != DataType::S32) ++ { ++ throw std::runtime_error("Unsupported axes type."); ++ } ++ if (axes()->shape().num_elements() > 1) ++ { ++ throw std::runtime_error("Current implementation does not support more than 1 axis."); ++ } ++ int axis_value = getTensorData(axes())[0]; ++ if (axis_value < 0 || axis_value >= input()->shape().num_dims()) ++ { ++ throw std::runtime_error("Invalid axes value"); ++ } ++ assert(input()->element_type() == output()->element_type()); ++ ++ output()->resize(input()->shape()); ++} ++ ++void Reverse::execute() const ++{ ++ int axis_value = getTensorData(axes())[0]; ++ switch (output()->element_type()) ++ { ++ case DataType::FLOAT32: ++ tflite::reference_ops::Reverse(axis_value, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ case DataType::U8: ++ tflite::reference_ops::Reverse( ++ axis_value, getTensorShape(input()), getTensorData(input()), ++ getTensorShape(output()), getTensorData(output())); ++ break; ++ default: ++ throw std::runtime_error("Unsupported output type"); ++ } ++} ++ ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h +new file mode 100644 +index 0000000..3489dae +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Reverse.h +@@ -0,0 +1,43 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H ++#define LUCI_INTERPRETER_KERNELS_REVERSE_H ++ ++#include "core/Kernel.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++class Reverse : public Kernel ++{ ++public: ++ Reverse(const Tensor *input, const Tensor *axes, Tensor *output); ++ ++ const Tensor *input() const { return _inputs[0]; } ++ const Tensor *axes() const { return _inputs[1]; } ++ Tensor *output() const { return _outputs[0]; } ++ ++ void configure() override; ++ void execute() const override; ++}; ++ ++} // namespace kernels ++} // namespace luci_interpreter ++ ++#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H +diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp +new file mode 100644 +index 0000000..5475a8b +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp +@@ -0,0 +1,66 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "kernels/Reverse.h" ++#include "kernels/TestUtils.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++template class ReverseTest : public ::testing::Test ++{ ++}; ++ ++using DataTypes = ::testing::Types; ++TYPED_TEST_CASE(ReverseTest, DataTypes); ++ ++TYPED_TEST(ReverseTest, MultiDimensions) ++{ ++ // TypeParam ++ std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ++ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; ++ Shape input_shape{4, 3, 2}; ++ std::vector axis_data{1}; ++ Shape axis_shape{1}; ++ ++ std::vector output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, ++ 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}; ++ std::vector output_shape{4, 3, 2}; ++ ++ Tensor input_tensor = makeInputTensor()>(input_shape, input_data); ++ Tensor axis_tensor = makeInputTensor(axis_shape, axis_data); ++ ++ Tensor output_tensor = makeOutputTensor(getElementType()); ++ ++ Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor); ++ kernel.configure(); ++ kernel.execute(); ++ ++ EXPECT_THAT(extractTensorData(output_tensor), ++ ::testing::ElementsAreArray(output_data)); ++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); ++} ++ ++} // namespace ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp +new file mode 100644 +index 0000000..c4bc3c5 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Slice.cpp +@@ -0,0 +1,149 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "kernels/Slice.h" ++#include "Utils.h" ++#include ++ ++#include ++#include ++ ++namespace luci_interpreter ++{ ++ ++namespace kernels ++{ ++const int max_dim = 4; ++ ++Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output) ++ : Kernel({input, begin, size}, {output}) ++{ ++} ++ ++template ++Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size) ++{ ++ Shape output_shape = Shape(input->shape().num_dims()); ++ for (int idx = 0; idx < input->shape().num_dims(); idx++) ++ { ++ T size_value = getTensorData(size)[idx]; ++ if (size_value < 0) ++ { ++ if (size_value != -1) ++ { ++ throw std::runtime_error("Invalid size."); ++ } ++ size_value = input->shape().dim(idx) - getTensorData(begin)[idx]; ++ } ++ else ++ { ++ if (input->shape().dim(idx) < getTensorData(begin)[idx] + size_value) ++ { ++ throw std::runtime_error("Invalid begin and size."); ++ } ++ } ++ output_shape.dim(idx) = static_cast(size_value); ++ } ++ return output_shape; ++} ++ ++template ++void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size, ++ std::vector *begins, std::vector *sizes) ++{ ++ for (int idx = dimensions - 1; idx >= 0; --idx) ++ { ++ begins->push_back(getTensorData(begin)[idx]); ++ sizes->push_back(getTensorData(size)[idx]); ++ } ++} ++ ++void Slice::configure() ++{ ++ assert(input()->element_type() == output()->element_type()); ++ assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64); ++ assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64); ++ assert(begin()->shape().num_dims() == 1); ++ assert(size()->shape().num_dims() == 1); ++ assert(input()->shape().num_dims() <= max_dim); ++ ++ if (begin()->element_type() == DataType::S32) ++ { ++ output()->resize(calculateOutputShape(input(), begin(), size())); ++ } ++ else if (begin()->element_type() == DataType::S64) ++ { ++ output()->resize(calculateOutputShape(input(), begin(), size())); ++ } ++ else ++ { ++ throw std::runtime_error("Unsupported type."); ++ } ++} ++ ++void Slice::execute() const ++{ ++ std::vector begins; ++ begins.reserve(max_dim); ++ std::vector sizes; ++ sizes.reserve(max_dim); ++ if (begin()->element_type() == DataType::S32) ++ { ++ getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); ++ } ++ else if (begin()->element_type() == DataType::S64) ++ { ++ getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); ++ } ++ else ++ { ++ throw std::runtime_error("Unsupported begin type."); ++ } ++ for (int i = input()->shape().num_dims(); i < max_dim; ++i) ++ { ++ begins.push_back(0); ++ sizes.push_back(1); ++ } ++ ++ assert(begins.size() == 4); ++ assert(sizes.size() == 4); ++ tflite::SliceParams op_params{}; ++ op_params.begin_count = 4; ++ op_params.size_count = 4; ++ for (int i = 0; i < 4; i++) ++ { ++ op_params.begin[i] = begins[3 - i]; ++ op_params.size[i] = sizes[3 - i]; ++ } ++ switch (input()->element_type()) ++ { ++ case DataType::FLOAT32: ++ 
tflite::optimized_ops::Slice(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ case DataType::U8: ++ tflite::optimized_ops::Slice(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ default: ++ throw std::runtime_error("Unsupported input type."); ++ } ++} ++ ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h +new file mode 100644 +index 0000000..23c3596 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Slice.h +@@ -0,0 +1,44 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H ++#define LUCI_INTERPRETER_KERNELS_SLICE_H ++ ++#include "core/Kernel.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++class Slice : public Kernel ++{ ++public: ++ Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output); ++ ++ const Tensor *input() const { return _inputs[0]; } ++ const Tensor *begin() const { return _inputs[1]; } ++ const Tensor *size() const { return _inputs[2]; } ++ Tensor *output() const { return _outputs[0]; } ++ ++ void configure() override; ++ void execute() const override; ++}; ++ ++} // namespace kernels ++} // namespace luci_interpreter ++ ++#endif // LUCI_INTERPRETER_KERNELS_SLICE_H +diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp +new file mode 100644 +index 0000000..a360a29 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp +@@ -0,0 +1,64 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "kernels/Slice.h" ++#include "kernels/TestUtils.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++template class SliceTest : public ::testing::Test ++{ ++}; ++ ++using DataTypes = ::testing::Types; ++TYPED_TEST_CASE(SliceTest, DataTypes); ++ ++TYPED_TEST(SliceTest, SimpleTest) ++{ ++ std::vector input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}; ++ Shape input_shape{3, 2, 3, 1}; ++ std::vector begin_data{1, 0, 0, 0}; ++ Shape begin_shape{4}; ++ std::vector size_data{2, 1, -1, 1}; ++ Shape size_shape{4}; ++ std::vector output_data{3, 3, 3, 5, 5, 5}; ++ std::vector output_shape{2, 1, 3, 1}; ++ ++ Tensor input_tensor = makeInputTensor()>(input_shape, input_data); ++ Tensor begin_tensor = makeInputTensor(begin_shape, begin_data); ++ Tensor size_tensor = makeInputTensor(size_shape, size_data); ++ ++ Tensor output_tensor = makeOutputTensor(getElementType()); ++ ++ Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor); ++ kernel.configure(); ++ kernel.execute(); ++ ++ EXPECT_THAT(extractTensorData(output_tensor), ++ ::testing::ElementsAreArray(output_data)); ++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); ++} ++ ++} // namespace ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +index 3386d36..b8c0ac4 100644 +--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +@@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple) + /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365}, + /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, + getElementType()); ++ ++ SUCCEED(); + } + + TEST(TransposeConvTest, FloatTwoFiltersTest) +@@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest) + 3352, 3652, 2760}, + /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, + getElementType()); +-} + +-TEST(TransposeConvTest, Uint8Simple) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case +-} +-TEST(TransposeConvTest, Uint8FiltersTest) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case ++ SUCCEED(); + } + ++// TODO Uint8Simple ++// Implement GetDequantizedOutput Function. ++// Create Test for Uint8 Case ++ ++// TODO Uint8FiltersTest ++// Implement GetDequantizedOutput Function. 
++// Create Test for Uint8 Case ++ + } // namespace + } // namespace kernels + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt +index fb36c4a..d99485d 100644 +--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt ++++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt +@@ -1,3 +1,5 @@ ++nnas_find_package(GTest REQUIRED) ++ + set(SOURCES + GraphLoader.h + GraphLoader.cpp +@@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO + target_link_libraries(luci_interpreter_loader + PUBLIC luci_lang luci_interpreter_core + PRIVATE luci_interpreter_kernels nncc_common) ++ ++set(TEST_SOURCES KernelBuilder.test.cpp) ++ ++GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES}) ++target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader) +diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp +index 779fa06..6ebf979 100644 +--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp ++++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp +@@ -16,7 +16,6 @@ + + #include "loader/GraphLoader.h" + +-#include "loader/ModuleLoader.h" + #include "loader/KernelBuilder.h" + + #include +@@ -71,6 +70,7 @@ bool isExecutableNode(const luci::CircleNode *node) + { + // These nodes denote inputs / outputs of a graph. + case luci::CircleOpcode::CONST: ++ case luci::CircleOpcode::CIRCLECONST: + case luci::CircleOpcode::CIRCLEINPUT: + case luci::CircleOpcode::CIRCLEOUTPUT: + // The following nodes denote outputs of multiple-output nodes. +@@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node) + + } // namespace + +-GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, +- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, +- std::unordered_map &node_to_tensor) +- : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph), +- _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor) ++GraphLoader::GraphLoader( ++ const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, ++ const std::unordered_map &graph_to_runtime_graph, ++ std::unordered_map &node_to_tensor) ++ : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir), ++ _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) + { + } + +@@ -136,6 +137,7 @@ void GraphLoader::loadTensors() + const luci::CircleQuantParam *params = node->quantparam(); + quantization.scale.assign(params->scale.cbegin(), params->scale.cend()); + quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend()); ++ quantization.quantized_dimension = params->quantized_dimension; + } + + auto tensor = std::make_unique(node->dtype(), std::move(shape), std::move(quantization), +@@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const + + void GraphLoader::loadOperators() + { +- KernelBuilder kernel_builder(_module_loader, *this); ++ KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor); + + // Create kernels for executable nodes. This has to be done in execution order. 
+ for (const loco::Node *loco_node : +@@ -195,11 +197,4 @@ void GraphLoader::loadOperators() + } + } + +-void GraphLoader::load() +-{ +- loadTensors(); +- initInputOutputTensors(); +- loadOperators(); +-} +- + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h +index e0adc0f..89c5bca 100644 +--- a/compiler/luci-interpreter/src/loader/GraphLoader.h ++++ b/compiler/luci-interpreter/src/loader/GraphLoader.h +@@ -27,29 +27,23 @@ + namespace luci_interpreter + { + +-class ModuleLoader; +- + class GraphLoader + { + public: +- GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, +- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, ++ GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, ++ const std::unordered_map &graph_to_runtime_graph, + std::unordered_map &node_to_tensor); + +- void load(); +- +- Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); } +- +-private: +- void loadOperators(); +- void initInputOutputTensors() const; + void loadTensors(); ++ void initInputOutputTensors() const; ++ void loadOperators(); + +- const ModuleLoader &_module_loader; ++private: + const loco::Graph *_graph; + RuntimeGraph *_runtime_graph; + RuntimeToIR &_runtime_to_ir; + ++ const std::unordered_map &_graph_to_runtime_graph; + std::unordered_map &_node_to_tensor; + }; + +diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp +index 56da961..c19f897 100644 +--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp ++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp +@@ -21,6 +21,7 @@ + #include "kernels/AveragePool2D.h" + #include "kernels/Concatenation.h" + #include "kernels/Conv2D.h" ++#include "kernels/DepthToSpace.h" + #include "kernels/DepthwiseConv2D.h" + #include "kernels/Elu.h" + #include "kernels/FullyConnected.h" +@@ -35,6 +36,8 @@ + #include "kernels/Mul.h" + #include "kernels/Pad.h" + #include "kernels/Reshape.h" ++#include "kernels/Reverse.h" ++#include "kernels/Slice.h" + #include "kernels/Softmax.h" + #include "kernels/SpaceToDepth.h" + #include "kernels/Split.h" +@@ -43,8 +46,6 @@ + #include "kernels/Unpack.h" + #include "kernels/Transpose.h" + #include "kernels/TransposeConv.h" +-#include "loader/GraphLoader.h" +-#include "loader/ModuleLoader.h" + + #include + +@@ -68,7 +69,7 @@ static std::vector collectOutputNodes(const luci::CircleNode + + const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const + { +- const Tensor *tensor = _graph_loader.getTensorForNode(node); ++ const Tensor *tensor = _node_to_tensor.at(node); + assert(tensor != nullptr); + return tensor; + } +@@ -81,7 +82,7 @@ const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) cons + + Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const + { +- Tensor *tensor = _graph_loader.getTensorForNode(node); ++ Tensor *tensor = _node_to_tensor.at(node); + assert(tensor != nullptr); + return tensor; + } +@@ -98,7 +99,7 @@ KernelBuilder::getOutputTensors(const std::vector &nodes) co + + RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const + { +- RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph); ++ RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); + assert(runtime_graph != nullptr); + return runtime_graph; + } +@@ -120,14 +121,14 @@ std::unique_ptr 
KernelBuilder::visit(const luci::CircleAdd *node) + std::unique_ptr KernelBuilder::visit(const luci::CircleArgMax *node) + { + assert(node->arity() == 2); +- const Tensor *input1 = getInputTensor(node->input()); +- const Tensor *input2 = getInputTensor(node->dimension()); ++ const Tensor *input = getInputTensor(node->input()); ++ const Tensor *axis = getInputTensor(node->dimension()); + Tensor *output = getOutputTensor(node); + + ArgMaxParams params{}; + params.output_type = node->output_type(); + +- return std::make_unique(input1, input2, output, params); ++ return std::make_unique(input, axis, output, params); + } + + std::unique_ptr KernelBuilder::visit(const luci::CircleAveragePool2D *node) +@@ -188,6 +189,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleConv2D *node) + return std::make_unique(input, filter, bias, output, params); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleDepthToSpace *node) ++{ ++ assert(node->arity() == 1); ++ ++ const Tensor *input = getInputTensor(node->input()); ++ Tensor *output = getOutputTensor(node); ++ ++ DepthToSpaceParams params{}; ++ params.block_size = node->block_size(); ++ ++ return std::make_unique(input, output, params); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node) + { + assert(node->arity() == 3); +@@ -224,14 +238,14 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleFullyConnected *n + assert(node->arity() == 3); + + const Tensor *input = getInputTensor(node->input()); +- const Tensor *filter = getInputTensor(node->weights()); ++ const Tensor *weights = getInputTensor(node->weights()); + const Tensor *bias = getOptionalInputTensor(node->bias()); + Tensor *output = getOutputTensor(node); + + FullyConnectedParams params{}; + params.activation = node->fusedActivationFunction(); + +- return std::make_unique(input, filter, bias, output, params); ++ return std::make_unique(input, weights, bias, output, params); + } + + std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) +@@ -255,6 +269,11 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) + else_graph); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) ++{ ++ throw std::runtime_error("Input node cannot be executed."); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleL2Normalize *node) + { + assert(node->arity() == 1); +@@ -323,11 +342,6 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleLogistic *node) + return std::make_unique(input, output); + } + +-std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) +-{ +- throw std::runtime_error("Input node cannot be executed."); +-} +- + std::unique_ptr KernelBuilder::visit(const luci::CircleMaxPool2D *node) + { + assert(node->arity() == 1); +@@ -402,6 +416,30 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleReshape *node) + return std::make_unique(input, shape, output); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleReverseV2 *node) ++{ ++ assert(node->arity() == 2); ++ ++ const Tensor *input = getInputTensor(node->tensor()); ++ const Tensor *axes = getInputTensor(node->axis()); ++ Tensor *output = getOutputTensor(node); ++ ++ return std::make_unique(input, axes, output); ++} ++ ++std::unique_ptr KernelBuilder::visit(const luci::CircleSlice *node) ++{ ++ assert(node->arity() == 3); ++ ++ const Tensor *input = getInputTensor(node->input()); ++ const Tensor *begin = getInputTensor(node->begin()); ++ const Tensor *size = getInputTensor(node->size()); ++ ++ Tensor *output = 
getOutputTensor(node); ++ ++ return std::make_unique(input, begin, size, output); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleSoftmax *node) + { + assert(node->arity() == 1); +@@ -442,6 +480,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleSplit *node) + return std::make_unique(axis, input, std::move(outputs)); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) ++{ ++ assert(node->arity() == 1); ++ ++ const Tensor *input = getInputTensor(node->input()); ++ Tensor *output = getOutputTensor(node); ++ ++ SqueezeParams params{}; ++ params.squeeze_dims = node->squeeze_dims(); ++ ++ return std::make_unique(input, output, params); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *node) + { + assert(node->arity() == 4); +@@ -463,21 +514,15 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *nod + return std::make_unique(input, begin, end, strides, output, params); + } + +-std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) ++std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) + { +- assert(node->arity() == 1); ++ assert(node->arity() == 2); + +- const Tensor *input = getInputTensor(node->input()); ++ const Tensor *input = getInputTensor(node->a()); ++ const Tensor *perm = getInputTensor(node->perm()); + Tensor *output = getOutputTensor(node); + +- SqueezeParams params{}; +- assert(node->squeeze_dims().size() <= 4); +- for (size_t i = 0; i < node->squeeze_dims().size(); i++) +- { +- params.squeeze_dims.push_back(node->squeeze_dims().at(i)); +- } +- +- return std::make_unique(input, output, params); ++ return std::make_unique(input, perm, output); + } + + std::unique_ptr KernelBuilder::visit(const luci::CircleTransposeConv *node) +@@ -515,15 +560,4 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleUnpack *node) + return std::make_unique(input, std::move(outputs), params); + } + +-std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) +-{ +- assert(node->arity() == 2); +- +- const Tensor *input = getInputTensor(node->a()); +- const Tensor *perm = getInputTensor(node->perm()); +- Tensor *output = getOutputTensor(node); +- +- return std::make_unique(input, perm, output); +-} +- + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h +index 7e30d39..d5c5a4b 100644 +--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h ++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h +@@ -24,18 +24,18 @@ + + #include + #include ++#include + + namespace luci_interpreter + { + +-class GraphLoader; +-class ModuleLoader; +- + class KernelBuilder : public luci::CircleNodeVisitor> + { + public: +- KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader) +- : _module_loader(module_loader), _graph_loader(graph_loader) ++ KernelBuilder( ++ const std::unordered_map &graph_to_runtime_graph, ++ const std::unordered_map &node_to_tensor) ++ : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) + { + } + +@@ -45,6 +45,7 @@ public: + std::unique_ptr visit(const luci::CircleConcatenation *node) override; + std::unique_ptr visit(const luci::CircleConv2D *node) override; + std::unique_ptr visit(const luci::CircleConst *node) override; ++ std::unique_ptr visit(const luci::CircleDepthToSpace *node) override; + std::unique_ptr visit(const luci::CircleDepthwiseConv2D *node) override; + std::unique_ptr 
visit(const luci::CircleElu *node) override; + std::unique_ptr visit(const luci::CircleFullyConnected *node) override; +@@ -61,6 +62,8 @@ public: + std::unique_ptr visit(const luci::CircleOutput *node) override; + std::unique_ptr visit(const luci::CirclePad *node) override; + std::unique_ptr visit(const luci::CircleReshape *node) override; ++ std::unique_ptr visit(const luci::CircleReverseV2 *node) override; ++ std::unique_ptr visit(const luci::CircleSlice *node) override; + std::unique_ptr visit(const luci::CircleSoftmax *node) override; + std::unique_ptr visit(const luci::CircleSpaceToDepth *node) override; + std::unique_ptr visit(const luci::CircleSplit *node) override; +@@ -82,8 +85,8 @@ private: + RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const; + + private: +- const ModuleLoader &_module_loader; +- const GraphLoader &_graph_loader; ++ const std::unordered_map &_graph_to_runtime_graph; ++ const std::unordered_map &_node_to_tensor; + }; + + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp +new file mode 100644 +index 0000000..33bc8ec +--- /dev/null ++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp +@@ -0,0 +1,743 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "loader/GraphLoader.h" ++#include "loader/KernelBuilder.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++namespace luci_interpreter ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++class KernelBuilderTest : public Test ++{ ++protected: ++ luci::CircleInput *createInputNode() { return createNode(); } ++ ++ template NodeT *createNode(Args &&... args) ++ { ++ auto *node = _graph.nodes()->create(std::forward(args)...); ++ // The actual type does not matter for the purpose of the tests. ++ // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry ++ // actual output types). 
++ node->dtype(loco::DataType::FLOAT32); ++ return node; ++ } ++ ++ template NodeOutT *createNodeOut(loco::Node *node, int index) ++ { ++ auto *node_out = createNode(); ++ node_out->input(node); ++ node_out->index(index); ++ return node_out; ++ } ++ ++ template std::unique_ptr buildKernel(const luci::CircleNode *op) ++ { ++ std::unordered_map graph_to_runtime_graph; ++ ++ RuntimeGraph runtime_graph(nullptr); ++ RuntimeToIR runtime_to_ir; ++ GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph, ++ _node_to_tensor); ++ graph_loader.loadTensors(); ++ ++ KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor); ++ ++ auto kernel = op->accept(&kernel_builder); ++ return std::unique_ptr(dynamic_cast(kernel.release())); ++ } ++ ++ void checkTensor(const Tensor *tensor, const loco::Node *node) ++ { ++ EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node))); ++ } ++ ++private: ++ loco::Graph _graph; ++ std::unordered_map _node_to_tensor; ++}; ++ ++TEST_F(KernelBuilderTest, Add) ++{ ++ auto *input1 = createInputNode(); ++ auto *input2 = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input1); ++ op->y(input2); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input1(), input1); ++ checkTensor(kernel->input2(), input2); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, ArgMax) ++{ ++ auto *input = createInputNode(); ++ auto *axis = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->dimension(axis); ++ ++ op->output_type(loco::DataType::FLOAT32); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->axis(), axis); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().output_type, Eq(op->output_type())); ++} ++ ++TEST_F(KernelBuilderTest, AveragePool2D) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->value(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->filter()->h(11); ++ op->filter()->w(13); ++ op->stride()->h(17); ++ op->stride()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); ++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, Concatenation) ++{ ++ auto *input1 = createInputNode(); ++ auto *input2 = createInputNode(); ++ ++ auto *op = createNode(2); ++ op->values(0, input1); ++ op->values(1, input2); ++ op->axis(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(0), input1); ++ checkTensor(kernel->input(1), input2); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().axis, Eq(op->axis())); ++} ++ ++TEST_F(KernelBuilderTest, Conv2D) ++{ ++ auto *input = createInputNode(); ++ auto *filter = createInputNode(); ++ auto *bias = 
createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->filter(filter); ++ op->bias(bias); ++ ++ op->padding(luci::Padding::SAME); ++ op->stride()->h(11); ++ op->stride()->w(13); ++ op->dilation()->h(17); ++ op->dilation()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->filter(), filter); ++ checkTensor(kernel->bias(), bias); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); ++ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, DepthToSpace) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->block_size(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size())); ++} ++ ++TEST_F(KernelBuilderTest, DepthwiseConv2D) ++{ ++ auto *input = createInputNode(); ++ auto *filter = createInputNode(); ++ auto *bias = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->filter(filter); ++ op->bias(bias); ++ ++ op->padding(luci::Padding::SAME); ++ op->depthMultiplier(11); ++ op->stride()->h(13); ++ op->stride()->w(17); ++ op->dilation()->h(19); ++ op->dilation()->w(23); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->filter(), filter); ++ checkTensor(kernel->bias(), bias); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); ++ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, Elu) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->features(input); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, FullyConnected) ++{ ++ auto *input = createInputNode(); ++ auto *weights = createInputNode(); ++ auto *bias = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->weights(weights); ++ op->bias(bias); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->weights(), weights); ++ checkTensor(kernel->bias(), bias); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ 
++TEST_F(KernelBuilderTest, L2Normalize) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, L2Pool2D) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->value(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->filter()->h(11); ++ op->filter()->w(13); ++ op->stride()->h(17); ++ op->stride()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); ++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, LeakyRelu) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->features(input); ++ ++ op->alpha(11.0f); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); ++} ++ ++TEST_F(KernelBuilderTest, LocalResponseNormalization) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->radius(11); ++ op->bias(13.0f); ++ op->alpha(15.0f); ++ op->beta(17.0f); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().radius, Eq(op->radius())); ++ EXPECT_THAT(kernel->params().bias, Eq(op->bias())); ++ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); ++ EXPECT_THAT(kernel->params().beta, Eq(op->beta())); ++} ++ ++TEST_F(KernelBuilderTest, Logistic) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, MaxPool2D) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->value(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->filter()->h(11); ++ op->filter()->w(13); ++ op->stride()->h(17); ++ op->stride()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); ++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, 
Mean) ++{ ++ auto *input = createInputNode(); ++ auto *axes = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->reduction_indices(axes); ++ ++ op->keep_dims(true); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->axes(), axes); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims())); ++} ++ ++TEST_F(KernelBuilderTest, Mul) ++{ ++ auto *input1 = createInputNode(); ++ auto *input2 = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input1); ++ op->y(input2); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input1(), input1); ++ checkTensor(kernel->input2(), input2); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, Pad) ++{ ++ auto *input = createInputNode(); ++ auto *paddings = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->paddings(paddings); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->paddings(), paddings); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, Reshape) ++{ ++ auto *input = createInputNode(); ++ auto *shape = createInputNode(); ++ ++ auto *op = createNode(); ++ op->tensor(input); ++ op->shape(shape); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->shape(), shape); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, ReverseV2) ++{ ++ auto *input = createInputNode(); ++ auto *axes = createInputNode(); ++ ++ auto *op = createNode(); ++ op->tensor(input); ++ op->axis(axes); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->axes(), axes); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, Slice) ++{ ++ auto *input = createInputNode(); ++ auto *begin = createInputNode(); ++ auto *size = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->begin(begin); ++ op->size(size); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->begin(), begin); ++ checkTensor(kernel->size(), size); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, Softmax) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->logits(input); ++ ++ op->beta(11.0f); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().beta, Eq(op->beta())); ++} ++ ++TEST_F(KernelBuilderTest, SpaceToDepth) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->block_size(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().block_size, op->block_size()); ++} ++ ++TEST_F(KernelBuilderTest, Split) ++{ ++ auto *axis = createInputNode(); ++ auto *input = createInputNode(); ++ auto *op = createNode(); ++ auto *output1 = createNodeOut(op, 0); ++ 
auto *output2 = createNodeOut(op, 1); ++ ++ op->split_dim(axis); ++ op->input(input); ++ ++ op->num_split(2); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->axis(), axis); ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(0), output1); ++ checkTensor(kernel->output(1), output2); ++} ++ ++TEST_F(KernelBuilderTest, Squeeze) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->squeeze_dims({11, 13}); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims())); ++} ++ ++TEST_F(KernelBuilderTest, StridedSlice) ++{ ++ auto *input = createInputNode(); ++ auto *begin = createInputNode(); ++ auto *end = createInputNode(); ++ auto *strides = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->begin(begin); ++ op->end(end); ++ op->strides(strides); ++ ++ op->begin_mask(11); ++ op->ellipsis_mask(13); ++ op->end_mask(17); ++ op->new_axis_mask(19); ++ op->shrink_axis_mask(23); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->begin(), begin); ++ checkTensor(kernel->end(), end); ++ checkTensor(kernel->strides(), strides); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask())); ++ EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask())); ++ EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask())); ++ EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask())); ++ EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask())); ++} ++ ++TEST_F(KernelBuilderTest, Transpose) ++{ ++ auto *input = createInputNode(); ++ auto *perm = createInputNode(); ++ ++ auto *op = createNode(); ++ op->a(input); ++ op->perm(perm); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->perm(), perm); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, TransposeConv) ++{ ++ auto *output_shape = createInputNode(); ++ auto *filter = createInputNode(); ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->inputSizes(output_shape); ++ op->filter(filter); ++ op->outBackprop(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->stride()->h(11); ++ op->stride()->w(13); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->output_shape(), output_shape); ++ checkTensor(kernel->filter(), filter); ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++} ++ ++TEST_F(KernelBuilderTest, Unpack) ++{ ++ auto *input = createInputNode(); ++ auto *op = createNode(); ++ auto *output1 = createNodeOut(op, 0); ++ auto *output2 = createNodeOut(op, 1); ++ ++ op->value(input); ++ ++ op->num(2); ++ op->axis(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(0), output1); ++ checkTensor(kernel->output(1), output2); ++ EXPECT_THAT(kernel->params().axis, Eq(op->axis())); ++} ++ 
++TEST_F(KernelBuilderTest, NonExisting1_NEG) ++{ ++ auto *op = createNode(); ++ ASSERT_ANY_THROW(buildKernel(op)); ++} ++ ++TEST_F(KernelBuilderTest, NonExisting2_NEG) ++{ ++ auto *op = createNode(); ++ ASSERT_ANY_THROW(buildKernel(op)); ++} ++ ++TEST_F(KernelBuilderTest, NonExisting3_NEG) ++{ ++ auto *op = createNode(); ++ ASSERT_ANY_THROW(buildKernel(op)); ++} ++ ++} // namespace ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp +index 7780a61..b9a2ae0 100644 +--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp ++++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp +@@ -41,8 +41,11 @@ void ModuleLoader::load() + { + const loco::Graph *graph = _module->graph(i); + RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); +- GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor); +- loader.load(); ++ GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph, ++ _node_to_tensor); ++ loader.loadTensors(); ++ loader.initInputOutputTensors(); ++ loader.loadOperators(); + } + } + +diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h +index 954dbfb..1af0ed7 100644 +--- a/compiler/luci-interpreter/src/loader/ModuleLoader.h ++++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h +@@ -36,11 +36,6 @@ public: + + void load(); + +- RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const +- { +- return _graph_to_runtime_graph.at(graph); +- } +- + private: + const luci::Module *_module; + RuntimeModule *_runtime_module; +diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh +index dfd55a6..12c9a45 100755 +--- a/compiler/luci-value-test/evalverify.sh ++++ b/compiler/luci-value-test/evalverify.sh +@@ -4,8 +4,10 @@ + # + # HOW TO USE + # +-# ./evalverify.sh ... +-# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test) ++# ./evalverify.sh ... 
++# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
++# work_dir : artifacts directory where test materials exist
++# venv_dir : python virtual environment home directory
+ 
+ VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py"
+diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
+index 6a332f9..364d881 100644
+--- a/compiler/luci-value-test/test.lst
++++ b/compiler/luci-value-test/test.lst
+@@ -1,6 +1,8 @@
+ #addeval(Abs_000)
+ addeval(Add_000)
++#addeval(Add_001)
+ addeval(Add_U8_000)
++#addeval(AddN_000)
+ #addeval(ArgMax_000)
+ #addeval(ArgMax_001)
+ #addeval(ArgMax_002)
+@@ -9,73 +11,173 @@ addeval(Add_U8_000)
+ #addeval(ArgMax_U8_001)
+ #addeval(ArgMax_U8_002)
+ #addeval(ArgMax_U8_003)
++#addeval(ArgMin_000)
++#addeval(ArgMin_001)
++#addeval(ArgMin_002)
++#addeval(ArgMin_003)
++#addeval(ArgMin_U8_000)
++#addeval(ArgMin_U8_001)
++#addeval(ArgMin_U8_002)
++#addeval(ArgMin_U8_003)
+ addeval(AveragePool2D_000)
++#addeval(BatchMatMul_000)
+ #addeval(BatchMatMulV2_000)
+ #addeval(BatchMatMulV2_001)
+ #addeval(BatchToSpaceND_000)
+ #addeval(Cast_000)
++#addeval(Cast_001)
++#addeval(Ceil_000)
+ addeval(Concatenation_000)
+ addeval(Concatenation_U8_000)
+ addeval(Conv2D_000)
+ addeval(Conv2D_001)
+ addeval(Conv2D_002)
++#addeval(Conv2D_003)
+ addeval(Conv2D_U8_000)
+ addeval(Conv2D_U8_001)
+ #addeval(Cos_000)
++#addeval(DepthToSpace_000)
+ addeval(DepthwiseConv2D_000)
+ addeval(DepthwiseConv2D_U8_000)
++#addeval(DepthwiseConv2D_U8_001)
++addeval(DepthwiseConv2D_001)
+ #addeval(Div_000)
++#addeval(ELU_000)
+ #addeval(Equal_000)
+ #addeval(Exp_000)
++#addeval(ExpandDims_000)
++#addeval(ExpandDims_001)
++#addeval(ExpandDims_002)
++#addeval(ExpandDims_003)
++#addeval(Fill_000)
++#addeval(Fill_001)
++#addeval(Floor_000)
++#addeval(FloorDiv_000)
++#addeval(FloorDiv_001)
++#addeval(FloorMod_000)
++#addeval(FloorMod_001)
+ addeval(FullyConnected_000)
+ addeval(FullyConnected_001)
+ #addeval(FullyConnected_002)
+ #addeval(FullyConnected_U8_000)
+ #addeval(Gather_000)
++#addeval(GatherNd_000)
++#addeval(Greater_000)
++#addeval(GreaterEqual_000)
+ #addeval(If_000)
+ #addeval(If_001)
++addeval(L2Normalize_000)
++addeval(L2Pool2D_000)
++#addeval(L2Pool2D_U8_000)
++#addeval(LeakyRelu_000)
++#addeval(Less_000)
++#addeval(LessEqual_000)
++#addeval(LocalResponseNormalization_000)
++#addeval(Log_000)
++#addeval(LogicalAnd_000)
+ #addeval(LogicalNot_000)
+ #addeval(LogicalOr_000)
+-#addeval(Logistic_000)
++addeval(Logistic_000)
++#addeval(LogSoftmax_000)
++#addeval(MatMul_000)
++#addeval(MatrixDiag_000)
++#addeval(MatrixSetDiag_000)
++#addeval(Maximum_000)
+ addeval(MaxPool2D_000)
+ addeval(MaxPool2D_U8_000)
+ addeval(Mean_000)
+ addeval(Mean_001)
+ addeval(Mean_U8_000)
++#addeval(Minimum_000)
++#addeval(MirrorPad_000)
+ addeval(Mul_000)
+ #addeval(Mul_U8_000)
++#addeval(Neg_000)
++#addeval(NotEqual_000)
++#addeval(OneHot_000)
++#addeval(OneHot_001)
++#addeval(OneHot_002)
++#addeval(OneHot_003)
+ #addeval(Pack_000)
+ #addeval(Pack_U8_000)
+ addeval(Pad_000)
+ addeval(Pad_U8_000)
++#addeval(Pow_000)
++#addeval(PRelu_000)
++#addeval(Range_000)
++#addeval(Rank_000)
++#addeval(ReduceAny_000)
++#addeval(ReduceAny_001)
++#addeval(ReduceAny_002)
++#addeval(ReduceAny_003)
++#addeval(ReduceMax_000)
++#addeval(ReduceMin_000)
+ #addeval(ReduceProd_000)
+ #addeval(ReduceProd_001)
+ #addeval(ReduceProd_002)
+ #addeval(ReduceProd_003)
+ #addeval(ReLU_000)
++#addeval(ReLU6_000)
++#addeval(ReLUN1To1_000) + addeval(Reshape_000) + addeval(Reshape_001) + addeval(Reshape_002) + #addeval(Reshape_003) + addeval(Reshape_U8_000) ++#addeval(ResizeBilinear_000) ++#addeval(ResizeNearestNeighbor_000) ++#addeval(ReverseSequence_000) ++#addeval(ReverseV2_000) ++#addeval(Round_000) + #addeval(Rsqrt_000) ++#addeval(ScatterNd_000) ++#addeval(SegmentSum_000) ++#addeval(Select_000) ++#addeval(Select_001) ++#addeval(Select_002) ++#addeval(SelectV2_000) ++#addeval(SelectV2_001) ++#addeval(SelectV2_002) ++#addeval(Shape_000) + #addeval(Sin_000) ++addeval(Slice_000) + addeval(Softmax_000) + #addeval(Softmax_U8_000) + #addeval(SpaceToBatchND_000) + #addeval(SpaceToBatchND_001) + #addeval(SpaceToBatchND_002) + #addeval(SpaceToBatchND_003) +-#addeval(StridedSlice_000) +-#addeval(StridedSlice_001) ++#addeval(SpaceToDepth_000) ++#addeval(SparseToDense_000) ++#addeval(Split_000) ++#addeval(SplitV_000) ++#addeval(Sqrt_000) ++#addeval(Square_000) ++#addeval(SquaredDifference_000) ++addeval(Squeeze_000) ++addeval(StridedSlice_000) ++addeval(StridedSlice_001) ++addeval(StridedSlice_002) + #addeval(Sub_000) + #addeval(Sub_U8_000) ++#addeval(Sum_000) ++#addeval(Sum_001) + #addeval(Tanh_000) + #addeval(Tile_000) + #addeval(Tile_U8_000) +-#addeval(Transpose_000) ++#addeval(TopKV2_000) ++#addeval(TopKV2_001) ++addeval(Transpose_000) ++#addeval(TransposeConv_000) + #addeval(Unpack_000) + #addeval(Unpack_001) + #addeval(Unpack_002) ++addeval(Unpack_003) ++#addeval(Where_000) ++#addeval(Where_001) + #addeval(While_000) + #addeval(While_001) ++#addeval(While_002) ++#addeval(While_003) ++#addeval(YUV_TO_RGB_U8_000) ++#addeval(ZerosLike_000) +diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp +index 3c01b67..344c99f 100644 +--- a/compiler/luci/export/src/CircleOperationExporter.cpp ++++ b/compiler/luci/export/src/CircleOperationExporter.cpp +@@ -890,7 +890,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node) + { + export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH, + circle::BuiltinOptions_SpaceToDepthOptions, +- CreateSpaceToDepthOptions(builder).Union()); ++ CreateSpaceToDepthOptions(builder, node->block_size()).Union()); + } + + void OperationExporter::visit(luci::CircleSparseToDense *node) +diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp +index 5cad392..dc8c2fb 100644 +--- a/compiler/luci/export/src/CircleTensorExporter.cpp ++++ b/compiler/luci/export/src/CircleTensorExporter.cpp +@@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam + scale = builder.CreateVector(quantparam->scale); + zero_point = builder.CreateVector(quantparam->zerop); + } +- return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point); ++ // Note: QuantizationDetails is not supported ++ return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point, ++ circle::QuantizationDetails::QuantizationDetails_NONE, ++ 0, quantparam->quantized_dimension); + } + + void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder, +diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp +index 81e945d..bc7f397 100644 +--- a/compiler/luci/import/src/CircleReader.cpp ++++ b/compiler/luci/import/src/CircleReader.cpp +@@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization) + const auto &max = quantization->max; + 
const auto &scale = quantization->scale;
+   const auto &zero_point = quantization->zero_point;
++  const auto &quantized_dimension = quantization->quantized_dimension;
+ 
+   if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty()))
+   {
+@@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
+     quantparam->max = max;
+     quantparam->scale = scale;
+     quantparam->zerop = zero_point;
++    quantparam->quantized_dimension = quantized_dimension;
+ 
+     return quantparam;
+   }
+diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp
+index 4426e15..8366546 100644
+--- a/compiler/luci/import/src/Importer.test.cpp
++++ b/compiler/luci/import/src/Importer.test.cpp
+@@ -20,4 +20,9 @@
+ 
+ #include
+ 
+-TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; }
++TEST(TensorFlowLiteImport, Dummy)
++{
++  luci::Importer import;
++
++  SUCCEED();
++}
+diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+index 85e7e55..c77c55e 100644
+--- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp
++++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+@@ -32,21 +32,7 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const
+   if (outputs.size() != 1)
+     return false;
+ 
+-  // Must be one of the following types
+-  // float16, float32, float64, complex64, or complex128
+   const auto &tensors = args.reader.tensors();
+-  const auto &tensor = tensors.at(inputs[0]);
+-  switch (tensor->type)
+-  {
+-    case circle::TensorType_FLOAT16:
+-    case circle::TensorType_FLOAT32:
+-    case circle::TensorType_FLOAT64:
+-    case circle::TensorType_COMPLEX64:
+-      break;
+-    default:
+-      return false;
+-  }
+-
+   if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
+     return false;
+ 
+diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+index 7bdf46d..eb0956c 100644
+--- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
++++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+@@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const
+   if (args.op.inputs.size() != 3)
+     return false;
+ 
++  const auto &inputs = args.op.inputs;
++  const auto &tensors = args.reader.tensors();
++  const auto &filter_tensor = tensors.at(inputs[1]);
++  const auto &filter_shape = filter_tensor.get()->shape;
++  const auto &ifm_tensor = tensors.at(inputs[2]);
++  const auto &ifm_shape = ifm_tensor.get()->shape;
++
++  // ifm and filters must be 4-D tensor
++  if (ifm_shape.size() != 4)
++    return false;
++  if (filter_shape.size() != 4)
++    return false;
++
++  // input shape : [batch, height, width, in_channels]
++  // filters shape : [output_channels, height, width, in_channels]
++  if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3))
++    return false;
++
+   return true;
+ }
+ 
+diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+index 488dcfb..acd7921 100644
+--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
++++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+@@ -120,6 +120,7 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected)
+ CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather)
+ CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm)
+ // Virtual node(s)
++CIRCLE_NODE(CIRCLECONST, void)
+ CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput)
+ CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput)
+ CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy) +diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h +index 7253e65..6944373 100644 +--- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h ++++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h +@@ -29,6 +29,7 @@ struct CircleQuantParam + std::vector max; + std::vector scale; + std::vector zerop; ++ int32_t quantized_dimension{0}; + }; + + } // namespace luci +diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp +index 26bf073..a5973e5 100644 +--- a/compiler/luci/lang/src/Module.test.cpp ++++ b/compiler/luci/lang/src/Module.test.cpp +@@ -22,7 +22,7 @@ TEST(ModuleTest, consturctor) + { + auto gs = luci::make_module(); + +- GTEST_SUCCEED(); ++ SUCCEED(); + } + + TEST(ModuleTest, add) +diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp +index 74ea82c..c07268c 100644 +--- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp ++++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp +@@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor) + ASSERT_EQ(0, custom_node.custom_code().size()); + } + +-TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); } ++TEST(CircleCustomTest, constructor_NEG) ++{ ++ ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); ++ ++ SUCCEED(); ++} + + TEST(CircleCustomTest, invalidIndex_NEG) + { +diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp +index e3c8c9f..35f28e9 100644 +--- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp ++++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp +@@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor) + TEST(CircleIfTestDeath, invalid_arity_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), ""); ++ ++ SUCCEED(); + } + + TEST(CircleIfTestDeath, invalid_output_count_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), ""); ++ ++ SUCCEED(); + } + + TEST(CircleIfTestDeath, invalid_input_get_index_NEG) +diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp +index 19290c0..913686f 100644 +--- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp ++++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp +@@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor) + TEST(CircleWhileTestDeath, invalid_arity_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), ""); ++ ++ SUCCEED(); + } + + TEST(CircleWhileTestDeath, invalid_output_count_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), ""); ++ ++ SUCCEED(); + } + + TEST(CircleWhileTestDeath, invalid_input_get_index_NEG) +diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp +index 90fbe90..2edf7a9 100644 +--- a/compiler/luci/pass/src/CircleOptimizer.cpp ++++ b/compiler/luci/pass/src/CircleOptimizer.cpp +@@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const + { + static const std::vector fakeq_supported_input_dtype{"float32"}; + static const std::vector fakeq_supported_output_dtype{"uint8"}; +- static const std::vector fakeq_supported_granularity{"layer"}; ++ static const std::vector fakeq_supported_granularity{"layer", "channel"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = 
_options->param(Options::AlgorithmParameters::Quantize_output_dtype); +@@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const + { + static const std::vector qwmm_supported_input_dtype{"float32"}; + static const std::vector qwmm_supported_output_dtype{"uint8"}; +- static const std::vector qwmm_supported_granularity{"layer"}; ++ static const std::vector qwmm_supported_granularity{"layer", "channel"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); +diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp +index b81db88..edbaefa 100644 +--- a/compiler/luci/pass/src/FuseBCQPass.cpp ++++ b/compiler/luci/pass/src/FuseBCQPass.cpp +@@ -67,14 +67,190 @@ const std::string node_name_prefix(luci::NodeName node_name) + return prefix; + } + ++/** ++ * @brief Create CircleOutputExclude operation, which has same shape and dtype with ++ * original circle_node. ++ */ ++luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node) ++{ ++ auto graph = circle_node->graph(); ++ auto noOp = graph->nodes()->create(); ++ ++ if (circle_node->shape_status() == luci::ShapeStatus::VALID) ++ { ++ noOp->dtype(circle_node->dtype()); ++ noOp->rank(circle_node->rank()); ++ for (uint32_t i = 0; i < circle_node->rank(); ++i) ++ noOp->dim(i) = circle_node->dim(i); ++ } ++ else ++ { ++ // For type inference ++ noOp->dtype(loco::DataType::FLOAT32); ++ } ++ ++ return noOp; ++}; ++ + } // namespace + + namespace + { + +-class BCQConverter final ++// V means the version of BCQ. ++template class BCQFuser; ++ ++template <> class BCQFuser<1> + { + public: ++ bool fuseBCQ(loco::Graph *g) ++ { ++ bool changed = false; ++ ++ for (auto node : loco::all_nodes(g)) ++ { ++ if (auto circle_const = dynamic_cast(node)) ++ { ++ add_BCQ_info_node(circle_const); ++ } ++ } ++ ++ if (!is_bcqinfo_valid()) ++ return false; ++ ++ for (auto node : loco::active_nodes(loco::output_nodes(g))) ++ { ++ if (auto gather = dynamic_cast(node)) ++ { ++ auto params = dynamic_cast(gather->params()); ++ if (params != nullptr && has_BCQ_info(params)) ++ { ++ auto bcq_gather = g->nodes()->create(); ++ ++ bcq_gather->op_version(1); ++ bcq_gather->input_scales(get_alpha(params)); ++ bcq_gather->input_binary(get_packed_binary_code(params)); ++ bcq_gather->indices(gather->indices()); ++ bcq_gather->input_clusters(packed_clusters(params)); ++ ++ // input_binary shape : [output_size, hidden_size] ++ const auto binary_hidden_size = ++ loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; ++ bcq_gather->input_hidden_size(binary_hidden_size); ++ ++ if (do_w_x(params)) ++ { ++ bcq_gather->axis(gather->axis()); ++ } ++ else ++ { ++ const auto axis_transpose = (gather->axis() == 0) ? 
1 : 0; ++ bcq_gather->axis(axis_transpose); ++ } ++ ++ loco::replace(gather).with(bcq_gather); ++ ++ changed = true; ++ } ++ } ++ else if (auto fully_connected = dynamic_cast(node)) ++ { ++ auto weights = dynamic_cast(fully_connected->weights()); ++ if (weights != nullptr && has_BCQ_info(weights)) ++ { ++ auto bcq_fc = g->nodes()->create(); ++ ++ bcq_fc->op_version(1); ++ bcq_fc->weights_scales(get_alpha(weights)); ++ bcq_fc->weights_binary(get_packed_binary_code(weights)); ++ bcq_fc->bias(fully_connected->bias()); ++ bcq_fc->weights_clusters(packed_clusters(weights)); ++ bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); ++ ++ loco::Node *bcq_input = fully_connected->input(); ++ int32_t batch_rank = 0; ++ ++ // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 ++ const auto original_input = loco::must_cast(fully_connected->input()); ++ if (original_input->shape_status() == luci::ShapeStatus::VALID && ++ original_input->rank() > 2) ++ { ++ auto new_shape = g->nodes()->create(); ++ new_shape->dtype(loco::DataType::S32); ++ new_shape->size(2); ++ new_shape->rank(1); ++ new_shape->dim(0) = 2; ++ ++ auto batch_size = 1; ++ for (uint32_t i = 0; i < original_input->rank() - 1; ++i) ++ batch_size *= original_input->dim(i).value(); ++ ++ new_shape->at(0) = batch_size; ++ new_shape->at(1) = ++ original_input->dim(original_input->rank() - 1).value(); ++ new_shape->shape_status(luci::ShapeStatus::VALID); ++ ++ auto reshape = g->nodes()->create(); ++ reshape->tensor(original_input); ++ reshape->shape(new_shape); ++ ++ bcq_input = reshape; ++ batch_rank = original_input->rank() - 2; ++ } ++ ++ // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected ++ if (do_w_x(weights)) ++ { ++ const auto binary_hidden_size = ++ loco::must_cast(fully_connected->input()) ++ ->dim(batch_rank) ++ .value(); ++ bcq_fc->weights_hidden_size(binary_hidden_size); ++ bcq_fc->input(bcq_input); ++ loco::replace(fully_connected).with(bcq_fc); ++ } ++ else ++ { ++ const auto binary_hidden_size = ++ loco::must_cast(fully_connected->input()) ++ ->dim(1 + batch_rank) ++ .value(); ++ bcq_fc->weights_hidden_size(binary_hidden_size); ++ ++ auto perm = g->nodes()->create(); ++ perm->dtype(loco::DataType::S32); ++ perm->size(2); ++ perm->rank(1); ++ perm->dim(0) = 2; ++ perm->at(0) = 1; ++ perm->at(1) = 0; ++ perm->shape_status(luci::ShapeStatus::VALID); ++ ++ auto input_transpose = g->nodes()->create(); ++ input_transpose->a(bcq_input); ++ input_transpose->perm(perm); ++ ++ bcq_fc->input(input_transpose); ++ ++ auto output_transpose = g->nodes()->create(); ++ output_transpose->a(bcq_fc); ++ output_transpose->perm(perm); ++ ++ loco::replace(fully_connected).with(output_transpose); ++ } ++ ++ changed = true; ++ } ++ } ++ } ++ ++ if (changed) ++ clear_BCQ_nodes(); ++ ++ return changed; ++ } ++ ++private: + void add_BCQ_info_node(luci::CircleConst *node) + { + const auto node_name = node->name(); +@@ -119,16 +295,65 @@ public: + return has_info; + } + ++ /** ++ * @brief Exclude BCQ information nodes which are used for fusing BCQ operations ++ * from graph output by using CircleOutputExclude ++ */ ++ void clear_BCQ_nodes() ++ { ++ auto clear_nodes = [](std::map &nodes) { ++ for (auto &n : nodes) ++ { ++ auto node = n.second; ++ ++ for (auto s : loco::succs(node)) ++ { ++ if (auto outnode = dynamic_cast(s)) ++ { ++ outnode->from(createNoOp(node)); ++ } ++ else if (auto reshape_node = dynamic_cast(s)) ++ { ++ for (auto o : loco::succs(reshape_node)) ++ { 
++ auto circle_output = loco::must_cast(o); ++ circle_output->from(createNoOp(reshape_node)); ++ } ++ } ++ } ++ } ++ }; ++ ++ clear_nodes(_do_w_x); ++ clear_nodes(_alpha); ++ clear_nodes(_packed_binary_code); ++ clear_nodes(_number_of_clusters); ++ clear_nodes(_size_of_clusters); ++ clear_nodes(_qbits_of_clusters); ++ clear_nodes(_dequant_weight); ++ } ++ ++ bool is_bcqinfo_valid() ++ { ++ // do_w_x should be int32 or bool type ++ for (auto n : _do_w_x) ++ { ++ if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32) ++ return false; ++ } ++ ++ return true; ++ } ++ ++private: + bool do_w_x(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + + if (_do_w_x[prefix]->dtype() == loco::DataType::S32) + return _do_w_x[prefix]->at(0) == 1; +- else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL) +- return _do_w_x[prefix]->at(0); + else +- throw std::runtime_error("do_w_x should be int or bool"); ++ return _do_w_x[prefix]->at(0); + } + + luci::CircleConst *get_alpha(luci::CircleConst *node) +@@ -187,64 +412,6 @@ public: + return packed_clusters; + } + +- /** +- * @brief Exclude BCQ information nodes which are used for fusing BCQ operations +- * from graph output by using CircleOutputExclude +- */ +- void clear_BCQ_nodes() +- { +- auto createNoOp = [](luci::CircleNode *circle_node) { +- auto graph = circle_node->graph(); +- auto noOp = graph->nodes()->create(); +- +- if (circle_node->shape_status() == luci::ShapeStatus::VALID) +- { +- noOp->dtype(circle_node->dtype()); +- noOp->rank(circle_node->rank()); +- for (uint32_t i = 0; i < circle_node->rank(); ++i) +- noOp->dim(i) = circle_node->dim(i); +- } +- else +- { +- // For type inference +- noOp->dtype(loco::DataType::FLOAT32); +- } +- +- return noOp; +- }; +- +- auto clear_nodes = [createNoOp](std::map &nodes) { +- for (auto &n : nodes) +- { +- auto node = n.second; +- +- for (auto s : loco::succs(node)) +- { +- if (auto outnode = dynamic_cast(s)) +- { +- outnode->from(createNoOp(node)); +- } +- else if (auto reshape_node = dynamic_cast(s)) +- { +- for (auto o : loco::succs(reshape_node)) +- { +- auto circle_output = loco::must_cast(o); +- circle_output->from(createNoOp(reshape_node)); +- } +- } +- } +- } +- }; +- +- clear_nodes(_do_w_x); +- clear_nodes(_alpha); +- clear_nodes(_packed_binary_code); +- clear_nodes(_number_of_clusters); +- clear_nodes(_size_of_clusters); +- clear_nodes(_qbits_of_clusters); +- clear_nodes(_dequant_weight); +- } +- + private: + std::map _do_w_x; + std::map _alpha; +@@ -262,142 +429,9 @@ namespace luci + + bool FuseBCQPass::run(loco::Graph *g) + { +- BCQConverter converter; +- + bool changed = false; + +- for (auto node : loco::all_nodes(g)) +- { +- if (auto circle_const = dynamic_cast(node)) +- { +- converter.add_BCQ_info_node(circle_const); +- } +- } +- +- for (auto node : loco::active_nodes(loco::output_nodes(g))) +- { +- if (auto gather = dynamic_cast(node)) +- { +- auto params = dynamic_cast(gather->params()); +- if (params != nullptr && converter.has_BCQ_info(params)) +- { +- auto bcq_gather = g->nodes()->create(); +- +- bcq_gather->input_scales(converter.get_alpha(params)); +- bcq_gather->input_binary(converter.get_packed_binary_code(params)); +- bcq_gather->indices(gather->indices()); +- bcq_gather->input_clusters(converter.packed_clusters(params)); +- +- const auto binary_hidden_size = +- loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; +- bcq_gather->input_hidden_size(binary_hidden_size); +- +- if 
(converter.do_w_x(params)) +- { +- bcq_gather->axis(gather->axis()); +- } +- else +- { +- const auto axis_transpose = (gather->axis() == 0) ? 1 : 0; +- bcq_gather->axis(axis_transpose); +- } +- +- loco::replace(gather).with(bcq_gather); +- +- changed = true; +- } +- } +- else if (auto fully_connected = dynamic_cast(node)) +- { +- auto weights = dynamic_cast(fully_connected->weights()); +- if (weights != nullptr && converter.has_BCQ_info(weights)) +- { +- auto bcq_fc = g->nodes()->create(); +- +- bcq_fc->weights_scales(converter.get_alpha(weights)); +- bcq_fc->weights_binary(converter.get_packed_binary_code(weights)); +- bcq_fc->bias(fully_connected->bias()); +- bcq_fc->weights_clusters(converter.packed_clusters(weights)); +- bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); +- +- loco::Node *bcq_input = fully_connected->input(); +- int32_t batch_rank = 0; +- +- // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 +- const auto original_input = loco::must_cast(fully_connected->input()); +- if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2) +- { +- auto new_shape = g->nodes()->create(); +- new_shape->dtype(loco::DataType::S32); +- new_shape->size(2); +- new_shape->rank(1); +- new_shape->dim(0) = 2; +- +- auto batch_size = 1; +- for (uint32_t i = 0; i < original_input->rank() - 1; ++i) +- batch_size *= original_input->dim(i).value(); +- +- new_shape->at(0) = batch_size; +- new_shape->at(1) = +- original_input->dim(original_input->rank() - 1).value(); +- new_shape->shape_status(ShapeStatus::VALID); +- +- auto reshape = g->nodes()->create(); +- reshape->tensor(original_input); +- reshape->shape(new_shape); +- +- bcq_input = reshape; +- batch_rank = original_input->rank() - 2; +- } +- +- // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected +- if (converter.do_w_x(weights)) +- { +- const auto binary_hidden_size = +- loco::must_cast(fully_connected->input()) +- ->dim(batch_rank) +- .value(); +- bcq_fc->weights_hidden_size(binary_hidden_size); +- bcq_fc->input(bcq_input); +- loco::replace(fully_connected).with(bcq_fc); +- } +- else +- { +- const auto binary_hidden_size = +- loco::must_cast(fully_connected->input()) +- ->dim(1 + batch_rank) +- .value(); +- bcq_fc->weights_hidden_size(binary_hidden_size); +- +- auto perm = g->nodes()->create(); +- perm->dtype(loco::DataType::S32); +- perm->size(2); +- perm->rank(1); +- perm->dim(0) = 2; +- perm->at(0) = 1; +- perm->at(1) = 0; +- perm->shape_status(ShapeStatus::VALID); +- +- auto input_transpose = g->nodes()->create(); +- input_transpose->a(bcq_input); +- input_transpose->perm(perm); +- +- bcq_fc->input(input_transpose); +- +- auto output_transpose = g->nodes()->create(); +- output_transpose->a(bcq_fc); +- output_transpose->perm(perm); +- +- loco::replace(fully_connected).with(output_transpose); +- } +- +- changed = true; +- } +- } +- } +- +- if (changed) +- converter.clear_BCQ_nodes(); ++ changed = BCQFuser<1>().fuseBCQ(g); + + return changed; + } +diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp +index 6726ce7..9c9e741 100644 +--- a/compiler/luci/pass/src/QuantizationUtils.cpp ++++ b/compiler/luci/pass/src/QuantizationUtils.cpp +@@ -99,6 +99,13 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t + nudged_zero_point = static_cast(std::round(zero_point_double)); + } + ++ // protect scale from being very low due to overflow ++ if (scale < 
1e-5) ++ { ++ scale = 1e-5; ++ nudged_zero_point = static_cast(std::round(qmin_double - rmin / scale)); ++ } ++ + nudged_min = static_cast((qmin_double - nudged_zero_point) * scale); + nudged_max = static_cast((qmax_double - nudged_zero_point) * scale); + +diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +index f8abee7..2264bd7 100644 +--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp ++++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +@@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node) + node->dtype() == loco::DataType::S32; // bias + } + +-void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor) ++void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor, ++ int32_t &channel_dim_index) + { + assert(node->dtype() == loco::DataType::FLOAT32); + +@@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto + uint32_t indices[4] = { + 0, + }; +- int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { +@@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto + } + + void asym_wquant_per_channel(CircleConst *node, std::vector &min, +- std::vector &scaling_factor) ++ std::vector &scaling_factor, int32_t &channel_dim_index) + { + assert(node->dtype() == loco::DataType::FLOAT32); + +@@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector &min, + uint32_t indices[4] = { + 0, + }; +- int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { +@@ -350,8 +349,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor + circle_node->dtype(loco::DataType::S16); + } + +- circle_node->quantparam()->max[0] = nudged_max; +- circle_node->quantparam()->min[0] = nudged_min; ++ circle_node->quantparam()->min.clear(); ++ circle_node->quantparam()->max.clear(); + circle_node->quantparam()->scale.push_back(scaling_factor); + circle_node->quantparam()->zerop.push_back(zp); + } +@@ -472,15 +471,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor + assert(quantparam != nullptr); + auto min = quantparam->min; + auto scaling_factor = quantparam->scale; ++ int32_t channel_dim_index = 0; + + if (output_type == loco::DataType::U8) + { +- asym_wquant_per_channel(circle_const, min, scaling_factor); ++ asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index); + } + else + { +- sym_wquant_per_channel(circle_const, scaling_factor); ++ sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index); + } ++ quantparam->min.clear(); ++ quantparam->max.clear(); ++ quantparam->quantized_dimension = channel_dim_index; + } + // Find min/max per layer-wise + else +@@ -493,6 +496,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor + auto min = quantparam->min[0]; + auto scaling_factor = quantparam->scale[0]; + asym_wquant_per_layer(circle_const, min, scaling_factor); ++ quantparam->min.clear(); ++ quantparam->max.clear(); + } + } + } +diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst +index 188e298..3da3437 100644 +--- a/compiler/luci/tests/test.lst ++++ b/compiler/luci/tests/test.lst +@@ -30,13 +30,16 @@ addread(Ceil_000) + addread(Concatenation_000) + addread(Concatenation_U8_000) + addread(Conv2D_000) ++addread(Conv2D_001) + addread(Conv2D_002) + addread(Conv2D_003) + addread(Conv2D_U8_000) ++addread(Conv2D_U8_001) + 
addread(Cos_000) + addread(DepthToSpace_000) + addread(DepthwiseConv2D_000) + addread(DepthwiseConv2D_U8_000) ++addread(DepthwiseConv2D_U8_001) + addread(DepthwiseConv2D_001) + addread(Div_000) + addread(ELU_000) +@@ -84,6 +87,7 @@ addread(MaxPool2D_000) + addread(MaxPool2D_U8_000) + addread(Mean_000) + addread(Mean_001) ++addread(Mean_U8_000) + addread(Minimum_000) + addread(MirrorPad_000) + addread(Mul_000) +@@ -97,6 +101,7 @@ addread(OneHot_003) + addread(Pack_000) + addread(Pack_U8_000) + addread(Pad_000) ++addread(Pad_U8_000) + addread(Pow_000) + addread(PRelu_000) + addread(Range_000) +@@ -222,13 +227,16 @@ addwrite(Ceil_000) + addwrite(Concatenation_000) + addwrite(Concatenation_U8_000) + addwrite(Conv2D_000) ++addwrite(Conv2D_001) + addwrite(Conv2D_002) + addwrite(Conv2D_003) + addwrite(Conv2D_U8_000) ++addwrite(Conv2D_U8_001) + addwrite(Cos_000) + addwrite(DepthToSpace_000) + addwrite(DepthwiseConv2D_000) + addwrite(DepthwiseConv2D_U8_000) ++addwrite(DepthwiseConv2D_U8_001) + addwrite(DepthwiseConv2D_001) + addwrite(Div_000) + addwrite(ELU_000) +@@ -276,6 +284,7 @@ addwrite(MaxPool2D_000) + addwrite(MaxPool2D_U8_000) + addwrite(Mean_000) + addwrite(Mean_001) ++addwrite(Mean_U8_000) + addwrite(Minimum_000) + addwrite(MirrorPad_000) + addwrite(Mul_000) +diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen +index 2c80664..820b6d8 100644 +--- a/compiler/one-cmds/one-codegen ++++ b/compiler/one-cmds/one-codegen +@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + + function Usage() + { +- echo "Usage: $0 [BACKEND] ..." ++ echo "Usage: one-codegen [BACKEND] ..." + echo "Available BACKEND drivers:" + backend_exist=0 + for file in `find $DRIVER_PATH -name *-compile -type f`; +@@ -33,23 +33,34 @@ function Usage() + if [ $backend_exist == 0 ]; then + echo " (There is no available backend drivers)" + fi ++ ++ exit 255 + } + +-# Get command from command-line +-BACKEND=$1; shift +-BACKEND_DRIVER="$BACKEND-compile" ++function version() ++{ ++ $DRIVER_PATH/one-version one-codegen ++ exit 255 ++} + +-if [[ -z "${BACKEND_DRIVER}" ]]; then ++# Get command from command-line ++BACKEND=$1 ++if [[ -z ${BACKEND} ]]; then + Usage +- exit 255 + fi ++shift ++ ++if [[ "${BACKEND}" == "--version" ]]; then ++ version ++fi ++ ++BACKEND_DRIVER="${BACKEND}-compile" + + BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}" + + if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then + echo "ERROR: '${BACKEND_DRIVER}' is not supported" + Usage +- exit 255 + fi + + "${BACKEND_DRIVER_CMD}" "$@" +diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import +index dbf4af5..b1dd8f4 100644 +--- a/compiler/one-cmds/one-import ++++ b/compiler/one-cmds/one-import +@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + + function Usage() + { +- echo "Usage: $0 [FRAMEWORK] ..." ++ echo "Usage: one-import [FRAMEWORK] ..." 
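
[Note on the compute_asym_scale_zp() hunk earlier in this patch: the new guard keeps a degenerate activation range from producing a near-zero scale, which the comment attributes to overflow of the quantized values. The following is a minimal stand-alone sketch of the guarded computation for the uint8 case only; the function name and signature are illustrative, not the exact luci helper, and the real code derives the zero point before clamping and then re-nudges min/max.]

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: asymmetric uint8 scale/zero-point selection with the
// low-scale guard added by this patch.
void asym_scale_zp_sketch(float rmin, float rmax, float &scale, int64_t &zero_point)
{
  const double qmin = 0.0;
  const double qmax = 255.0;

  // The real range must contain 0.0 so that zero maps exactly onto the grid.
  rmin = std::min(rmin, 0.0f);
  rmax = std::max(rmax, 0.0f);

  scale = static_cast<float>((rmax - rmin) / (qmax - qmin));

  // Guard: clamp a near-zero scale and derive the zero point from the
  // clamped value, as the patch does with nudged_zero_point.
  if (scale < 1e-5f)
    scale = 1e-5f;

  const double zp = qmin - rmin / scale;
  zero_point = static_cast<int64_t>(std::round(std::min(std::max(zp, qmin), qmax)));
}
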
+ echo "Available FRAMEWORK drivers:" + framework_exist=0 + for file in "$DRIVER_PATH"/one-import-*; +@@ -31,23 +31,34 @@ function Usage() + if [ $framework_exist == 0 ]; then + echo " (There is no available import drivers)" + fi ++ ++ exit 255 + } + +-# Get command from command-line +-FRAMEWORK=$1; shift +-FRAMEWORK_DRIVER="one-import-$FRAMEWORK" ++function version() ++{ ++ $DRIVER_PATH/one-version one-import-tf ++ exit 255 ++} + +-if [[ -z "${FRAMEWORK_DRIVER}" ]]; then ++# Get command from command-line ++FRAMEWORK=$1 ++if [[ -z ${FRAMEWORK} ]]; then + Usage +- exit 255 ++fi ++shift ++ ++if [ ${FRAMEWORK} = "--version" ]; then ++ version + fi + ++FRAMEWORK_DRIVER="one-import-$FRAMEWORK" ++ + FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}" + + if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then + echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported" + Usage +- exit 255 + fi + + "${FRAMEWORK_DRIVER_CMD}" "$@" +diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf +index c048a4e..d59e1c5 100644 +--- a/compiler/one-cmds/one-import-tf ++++ b/compiler/one-cmds/one-import-tf +@@ -22,14 +22,24 @@ usage() + { + echo "Convert TensorFlow model to circle." + echo "Usage: one-import-tf" ++ echo " --version Show version information and exit" + echo " --input_path " + echo " --output_path " + echo " --input_arrays " + echo " --input_shapes " + echo " --output_arrays " +- exit 0 ++ echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)" ++ exit 255 + } + ++version() ++{ ++ $DRIVER_PATH/one-version one-import-tf ++ exit 255 ++} ++ ++TF_INTERFACE="--v1" ++ + # Parse command-line arguments + # + while [ "$#" -ne 0 ]; do +@@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + '--input_path') + export INPUT_PATH="$2" + shift 2 +@@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do + export OUTPUT_ARRAYS="$2" + shift 2 + ;; ++ '--v2') ++ TF_INTERFACE="--v2" ++ shift ++ ;; + *) + echo "Unknown parameter: ${CUR}" + shift +@@ -92,14 +109,21 @@ fi + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # generate temporary tflite file +-echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ ++echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ + --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ + --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ + --output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log" + echo " " >> "${OUTPUT_PATH}.log" + +-python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ ++python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ + --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ + --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ + --output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1 +diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite +index 31ed5af..053489c 100644 +--- a/compiler/one-cmds/one-import-tflite ++++ b/compiler/one-cmds/one-import-tflite +@@ -22,9 +22,16 @@ usage() + { + echo "Convert TensorFlow lite model to circle." 
+ echo "Usage: one-import-tflite" ++ echo " --version Show version information and exit" + echo " --input_path " + echo " --output_path " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-import-tflite ++ exit 255 + } + + # Parse command-line arguments +@@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + '--input_path') + export INPUT_PATH="$2" + shift 2 +@@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # convert .tflite to .circle + echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log" + +diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize +index 95384c1..17b6b98 100644 +--- a/compiler/one-cmds/one-optimize ++++ b/compiler/one-cmds/one-optimize +@@ -22,6 +22,7 @@ usage() + { + echo "Optimize circle model." + echo "Usage: one-optimize" ++ echo " --version Show version information and exit" + echo " --all Enable all optimization algorithms" + echo " --fuse_bcq Enable FuseBCQ Pass" + echo " --fuse_instnorm Enable FuseInstanceNormalization Pass" +@@ -33,7 +34,13 @@ usage() + echo " Enable ResolveCustomOpMatMulPass Pass" + echo " --input_path " + echo " --output_path " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-optimize ++ exit 255 + } + + OPTIMIZE_all=0 +@@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + '--all') + OPTIMIZE_all=1 + shift +@@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + + OPTIMIZE_OPTIONS="" +@@ -123,6 +132,13 @@ fi + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # NOTE do not wrap ${OPTIMIZE_OPTIONS} with "" + # optimize circle + echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \ +diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack +index 2bc4c60..9224b2c 100644 +--- a/compiler/one-cmds/one-pack ++++ b/compiler/one-cmds/one-pack +@@ -22,9 +22,16 @@ usage() + { + echo "Package circle to nnpkg" + echo "Usage: one-pack" ++ echo " -v, --version Show version information and exit" + echo " -i " + echo " -o " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-pack ++ exit 255 + } + + # Parse command-line arguments +@@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '-v') ++ version ++ ;; ++ '--version') ++ version ++ ;; + '-i') + export INPUT_PATH="$2" + shift 2 +@@ -55,12 +68,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # Package circle model file to nnpkg + echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log" + +diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize +index ff9e266..c74b2c2 100644 +--- a/compiler/one-cmds/one-quantize ++++ b/compiler/one-cmds/one-quantize +@@ -22,16 +22,23 @@ usage() + { + echo "Quantize circle model." 
+ echo "Usage: one-quantize" ++ echo " --version Show version information and exit" + echo " --input_dtype Input data type (supported: float32, default=float32)" + echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)" +- echo " --granularity Quantize granularity (supported: layer, default=layer)" ++ echo " --granularity Quantize granularity (supported: layer, channel, default=layer)" + echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)" + echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)" + echo " --mode Record mode (supported: percentile/moving_average, default=percentile)" + echo " --input_path " + echo " --input_data " + echo " --output_path " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-quantize ++ exit 255 + } + + INPUT_DTYPE=float32 +@@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + + '--input_dtype') + INPUT_DTYPE="$2" +@@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then + echo "Error: input data not found" + echo "" + usage +- exit 2 + fi + + FILE_BASE=$(basename ${OUTPUT_PATH}) +@@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # quantize circle + echo "${DRIVER_PATH}/circle-quantizer" \ + --quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \ +diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake +index 9b858ad..812149c 100644 +--- a/compiler/one-cmds/requires.cmake ++++ b/compiler/one-cmds/requires.cmake +@@ -3,3 +3,4 @@ require("tflite2circle") + require("circle2circle") + require("circle-quantizer") + require("record-minmax") ++require("vconone") +diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt +index 862660e..f8a165b 100644 +--- a/compiler/record-minmax/CMakeLists.txt ++++ b/compiler/record-minmax/CMakeLists.txt +@@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain) + target_link_libraries(record-minmax luci_import) + target_link_libraries(record-minmax luci_export) + target_link_libraries(record-minmax luci_interpreter) ++target_link_libraries(record-minmax vconone) + + install(TARGETS record-minmax DESTINATION bin) + ++if(NOT ENABLE_TEST) ++ return() ++endif(NOT ENABLE_TEST) ++ + nnas_find_package(GTest REQUIRED) + GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp") + target_include_directories(record_minmax_function_test PRIVATE include) +diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp +index ae4fcb7..8b09498 100644 +--- a/compiler/record-minmax/driver/Driver.cpp ++++ b/compiler/record-minmax/driver/Driver.cpp +@@ -17,6 +17,13 @@ + #include "RecordMinMax.h" + + #include ++#include ++ ++void print_version(void) ++{ ++ std::cout << "record-minmax version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} + + int entry(const int argc, char **argv) + { +@@ -25,6 +32,13 @@ int entry(const int argc, char **argv) + arser::Arser arser( + "Embedding min/max values of activations to the circle model for post-training quantization"); + ++ arser.add_argument("--version") ++ .nargs(0) 
++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument("--input_model") + .nargs(1) + .type(arser::DataType::STR) +@@ -66,7 +80,7 @@ int entry(const int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + auto input_model_path = arser.get("--input_model"); +diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake +index 0545035..f6804ce 100644 +--- a/compiler/record-minmax/requires.cmake ++++ b/compiler/record-minmax/requires.cmake +@@ -1,3 +1,4 @@ + require("luci") + require("safemain") + require("arser") ++require("vconone") +diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp +index cf30cd8..a0e65ee 100644 +--- a/compiler/record-minmax/src/HDF5Importer.cpp ++++ b/compiler/record-minmax/src/HDF5Importer.cpp +@@ -20,6 +20,7 @@ + + #include + #include ++#include + + using Shape = luci_interpreter::Shape; + using DataType = luci_interpreter::DataType; +diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp +index 45f0197..410ce3d 100644 +--- a/compiler/record-minmax/src/MinMaxObserver.cpp ++++ b/compiler/record-minmax/src/MinMaxObserver.cpp +@@ -38,7 +38,8 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node, + assert(node->opcode() != luci::CircleOpcode::UNPACK); + assert(node->opcode() != luci::CircleOpcode::WHILE); + +- if (node->opcode() == luci::CircleOpcode::CONST) ++ if (node->opcode() == luci::CircleOpcode::CONST || ++ node->opcode() == luci::CircleOpcode::CIRCLECONST) + { + // node is not activation. Do nothing. + return; +diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp +index d12a0d3..17c6aa6 100644 +--- a/compiler/record-minmax/src/RecordMinMax.cpp ++++ b/compiler/record-minmax/src/RecordMinMax.cpp +@@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input + auto node = iter->first; + auto minmax = iter->second; + +- float min, max; ++ float min{0.0f}, max{0.0f}; + if (mode == "percentile") + { + min = getNthPercentile(minmax.min_vector, min_percentile); +diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp +index 13b464d..e2f135a 100644 +--- a/compiler/record-minmax/tests/RecordFunction.test.cpp ++++ b/compiler/record-minmax/tests/RecordFunction.test.cpp +@@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge) + + EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0)); + EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, Simple) +@@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple) + { + EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i)); + } ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, Float) +@@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float) + EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1)); + EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14)); + EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, FloatWithNegative) +@@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative) + EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1)); + EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14)); + 
EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, SigleElement) +@@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement) + EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0)); + EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50)); + EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 100)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, OutOfBoundary_NEG) +@@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG) + + EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error); + EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, EmptyVector_NEG) +@@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG) + std::vector input; + + EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error); ++ ++ SUCCEED(); + } + + } // namespace record_minmax +diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt +index d33059f..4421a46 100644 +--- a/compiler/tfl-verify/CMakeLists.txt ++++ b/compiler/tfl-verify/CMakeLists.txt +@@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") + + add_executable(tfl-verify ${SOURCES}) + target_include_directories(tfl-verify PRIVATE src) ++target_link_libraries(tfl-verify arser) + target_link_libraries(tfl-verify foder) + target_link_libraries(tfl-verify mio_tflite) + target_link_libraries(tfl-verify safemain) +diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake +index ed6b84d..79503f3 100644 +--- a/compiler/tfl-verify/requires.cmake ++++ b/compiler/tfl-verify/requires.cmake +@@ -1,3 +1,4 @@ ++require("arser") + require("foder") + require("mio-tflite") + require("safemain") +diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp +index 81f6d54..6d18976 100644 +--- a/compiler/tfl-verify/src/Driver.cpp ++++ b/compiler/tfl-verify/src/Driver.cpp +@@ -16,22 +16,31 @@ + + #include "VerifyFlatBuffers.h" + ++#include ++ + #include + #include + #include + + int entry(int argc, char **argv) + { +- if (argc != 2) ++ arser::Arser arser; ++ arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify"); ++ ++ try + { +- std::cerr << "ERROR: Failed to parse arguments" << std::endl; +- std::cerr << std::endl; +- std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl; ++ arser.parse(argc, argv); ++ } ++ catch (const std::runtime_error &err) ++ { ++ std::cout << err.what() << std::endl; ++ std::cout << arser; + return 255; + } ++ + auto verifier = std::make_unique(); + +- std::string model_file = argv[argc - 1]; ++ std::string model_file = arser.get("tflite"); + + std::cout << "[ RUN ] Check " << model_file << std::endl; + +diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp +index 932a649..692ce48 100644 +--- a/compiler/tflchef/core/src/ModelChef.cpp ++++ b/compiler/tflchef/core/src/ModelChef.cpp +@@ -413,6 +413,7 @@ template void cook_graph(const T &graph, CookParams &cp) + quant_builder.add_min(quant_min); + quant_builder.add_scale(quant_scale); + quant_builder.add_zero_point(quant_zero_point); ++ quant_builder.add_quantized_dimension(quant.quantized_dimension()); + + // Update QuantizationParameters Index + quant_index = quant_builder.Finish(); +diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto +index 792503b..55785c3 100644 +--- a/compiler/tflchef/proto/tflchef.proto ++++ b/compiler/tflchef/proto/tflchef.proto +@@ -35,6 +35,7 @@ 
message TensorQuantization { + repeated float max = 2; + repeated float scale = 3; + repeated int64 zero_point = 4; ++ optional int32 quantized_dimension = 5 [default = 0]; + } + + message Operand { +diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp +index db62d0e..088961c 100644 +--- a/compiler/tflchef/tflite/src/RecipeChef.cpp ++++ b/compiler/tflchef/tflite/src/RecipeChef.cpp +@@ -184,6 +184,8 @@ std::unique_ptr generate_recipe(const tflite::Model *model) + for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx) + chef_quant->add_zero_point(quant->zero_point()->Get(idx)); + } ++ tflchef::TensorQuantization *chef_quant = operand->mutable_quant(); ++ chef_quant->set_quantized_dimension(quant->quantized_dimension()); + } + } + +diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp +index cecfeeb..46e5b55 100644 +--- a/compiler/tflchef/tools/file/Driver.cpp ++++ b/compiler/tflchef/tools/file/Driver.cpp +@@ -41,7 +41,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + int32_t model_version = 1; +diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp +index 1116dec..4d795a3 100644 +--- a/compiler/tflchef/tools/reverse/Driver.cpp ++++ b/compiler/tflchef/tools/reverse/Driver.cpp +@@ -38,7 +38,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string tflite_path = arser.get("tflite"); +diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp +index 3961d2f..38c9c06 100644 +--- a/compiler/tfldump/driver/Driver.cpp ++++ b/compiler/tfldump/driver/Driver.cpp +@@ -33,7 +33,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << '\n'; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string tflite_path = arser.get("tflite"); +diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt +index a0a2e02..b1d1f61 100644 +--- a/compiler/tflite2circle/CMakeLists.txt ++++ b/compiler/tflite2circle/CMakeLists.txt +@@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser) + target_link_libraries(tflite2circle safemain) + target_link_libraries(tflite2circle mio_tflite) + target_link_libraries(tflite2circle mio_circle) ++target_link_libraries(tflite2circle vconone) + + install(TARGETS tflite2circle DESTINATION bin) +diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp +index 67b8e33..2f11e0a 100644 +--- a/compiler/tflite2circle/driver/Driver.cpp ++++ b/compiler/tflite2circle/driver/Driver.cpp +@@ -24,10 +24,25 @@ + #include "CircleModel.h" + #include "TFLModel.h" + ++#include ++ ++void print_version(void) ++{ ++ std::cout << "tflite2circle version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} ++ + int entry(int argc, char **argv) + { + arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"}; + ++ arser.add_argument("--version") ++ .nargs(0) ++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument("tflite") + .nargs(1) + .type(arser::DataType::STR) +@@ -42,7 +57,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + 
std::string tfl_path = arser.get("tflite"); +diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake +index ff19b74..837c287 100644 +--- a/compiler/tflite2circle/requires.cmake ++++ b/compiler/tflite2circle/requires.cmake +@@ -2,3 +2,4 @@ require("arser") + require("mio-tflite") + require("mio-circle") + require("safemain") ++require("vconone") +diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt +new file mode 100644 +index 0000000..b8cb793 +--- /dev/null ++++ b/compiler/vconone/CMakeLists.txt +@@ -0,0 +1,31 @@ ++if (NOT VCONONE_VERSION) ++ set(VCONONE_VERSION 0x0000000000080001) ++ # NOTE order is [build patch minor major] ++ # if VCONONE_VERSION is set with -D option, it will be cached ++ # you may have to remove cache file if you remove -D option ++endif() ++ ++configure_file(version_cfg.h.in version_cfg.h @ONLY) ++ ++set(DRIVER "driver/driver.cpp") ++ ++file(GLOB_RECURSE SOURCES "src/*.cpp") ++file(GLOB_RECURSE TESTS "src/*.test.cpp") ++list(REMOVE_ITEM SOURCES ${TESTS}) ++ ++add_library(vconone STATIC ${SOURCES}) ++target_include_directories(vconone PUBLIC include) ++target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) ++ ++add_executable(one-version ${DRIVER}) ++target_link_libraries(one-version vconone) ++install(TARGETS one-version DESTINATION bin) ++ ++if(NOT ENABLE_TEST) ++ return() ++endif(NOT ENABLE_TEST) ++ ++nnas_find_package(GTest REQUIRED) ++ ++GTest_AddTest(vconone_test ${TESTS}) ++target_link_libraries(vconone_test vconone) +diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md +new file mode 100644 +index 0000000..c08dd63 +--- /dev/null ++++ b/compiler/vconone/README.md +@@ -0,0 +1,14 @@ ++# vconone ++ ++_vconone_ provides version number and strings for one-* commands and command ++line tools ++ ++# Revise version number ++ ++To revise version number, update `VCONONE_VERSION` in `CmakeLists.txt` ++or give `-DVCONONE_VERSION=0x0000000100080001` at cmake configure step. ++ ++Number given is four numbers `build`, `patch`, `minor` and `major` in order for ++each 16bit integers. `build` is not used for now. ++ ++`0x0000000100080001` version is interpretered as `1.8.1` +diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp +new file mode 100644 +index 0000000..12bd0ee +--- /dev/null ++++ b/compiler/vconone/driver/driver.cpp +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
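
[Note on the quantization plumbing earlier in this patch: QuantizeWithMinMaxPass now records channel_dim_index into quantparam->quantized_dimension, and tflchef.proto / ModelChef / RecipeChef carry a matching quantized_dimension field. That field names the tensor axis along which the per-channel scale and zero_point arrays are indexed. The sketch below is illustrative only and is not part of the patch; the helper name is hypothetical.]

#include <cstdint>
#include <vector>

// Dequantization rule implied by quantized_dimension: scale[c] and
// zero_point[c] apply to every element whose index along the quantized
// dimension equals c.
float dequantize_per_channel(int8_t q, uint32_t channel_index, const std::vector<float> &scale,
                             const std::vector<int64_t> &zero_point)
{
  return scale[channel_index] * static_cast<float>(q - zero_point[channel_index]);
}
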
++ */ ++ ++#include ++ ++#include ++#include ++ ++int main(int argc, char *argv[]) ++{ ++ auto str = vconone::get_string(); ++ if (argc >= 2) ++ { ++ for (int c = 1; c < argc; ++c) ++ std::cout << argv[c] << " "; ++ std::cout << "version " << str << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++ } ++ else ++ std::cout << str; ++ ++ return 0; ++} +diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h +new file mode 100644 +index 0000000..a6a1998 +--- /dev/null ++++ b/compiler/vconone/include/vconone/vconone.h +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __VCON_ONE_H__ ++#define __VCON_ONE_H__ ++ ++#include ++#include ++ ++namespace vconone ++{ ++ ++struct four ++{ ++ uint16_t major; ++ uint16_t minor; ++ uint16_t patch; ++ uint16_t build; // build is not used for now ++}; ++ ++union version { ++ uint64_t v; ++ four f; ++}; ++ ++/** ++ * @brief get_number will return version union structure ++ */ ++version get_number(void); ++ ++/** ++ * @brief get_string will return string of major.minor.patch (without build) ++ */ ++std::string get_string(void); ++ ++/** ++ * @brief get_string4 will return string of major.minor.patch.build ++ */ ++std::string get_string4(void); ++ ++/** ++ * @brief get_copyright will return copyright string ++ */ ++std::string get_copyright(void); ++ ++} // namespace vconone ++ ++#endif // __VCON_ONE_H__ +diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp +new file mode 100644 +index 0000000..9b693c6 +--- /dev/null ++++ b/compiler/vconone/src/version.cpp +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "vconone/vconone.h" ++ ++#include "version_cfg.h" ++ ++#include ++ ++namespace vconone ++{ ++ ++version get_number(void) ++{ ++ version v; ++ v.v = VCONONE_VERSION; ++ return v; ++} ++ ++std::string get_string4(void) ++{ ++ std::ostringstream ss; ++ ++ auto v = get_number(); ++ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "." ++ << unsigned(v.f.build); ++ ++ return ss.str(); ++} ++ ++std::string get_string(void) ++{ ++ std::ostringstream ss; ++ ++ auto v = get_number(); ++ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." 
<< unsigned(v.f.patch); ++ ++ return ss.str(); ++} ++ ++std::string get_copyright(void) ++{ ++ std::string str; ++ str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n"; ++ str += "Licensed under the Apache License, Version 2.0\r\n"; ++ str += "https://github.com/Samsung/ONE"; ++ return str; ++} ++ ++} // namespace vconone +diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp +new file mode 100644 +index 0000000..35a0647 +--- /dev/null ++++ b/compiler/vconone/src/version.test.cpp +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include ++ ++#include ++ ++TEST(vconone, version_number) ++{ ++ auto v = vconone::get_number(); ++ ++ ASSERT_NE(0x0000000000000000ULL, v.v); ++} ++ ++TEST(vconone, version_string) ++{ ++ auto str = vconone::get_string(); ++ ++ ASSERT_NE("..", str); ++ ASSERT_NE("", str); ++} ++ ++TEST(vconone, version_string4) ++{ ++ auto str = vconone::get_string4(); ++ ++ ASSERT_NE("...", str); ++ ASSERT_NE("", str); ++} ++ ++TEST(vconone, copyright) ++{ ++ auto str = vconone::get_copyright(); ++ ++ ASSERT_NE("", str); ++} +diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in +new file mode 100644 +index 0000000..aa3ad9e +--- /dev/null ++++ b/compiler/vconone/version_cfg.h.in +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __VCON_ONE_VERSION_CFG_H__ ++#define __VCON_ONE_VERSION_CFG_H__ ++ ++#define VCONONE_VERSION @VCONONE_VERSION@ULL ++ ++#endif // __VCON_ONE_VERSION_CFG_H__ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h +deleted file mode 100644 +index 9699b5c..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h ++++ /dev/null +@@ -1,124 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
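
[Usage note on the vconone sources above: VCONONE_VERSION packs four 16-bit fields, and with the little-endian layout the version union relies on, the default 0x0000000000080001 reads as major=1, minor=8, patch=0, build=0. The sketch below decodes the same value without the union; it assumes the field order documented in the README and version_cfg.h.in, and is not part of the patch.]

#include <cstdint>
#include <iostream>

int main()
{
  const uint64_t v = 0x0000000000080001ULL; // default VCONONE_VERSION

  const uint16_t major = static_cast<uint16_t>(v & 0xFFFF);
  const uint16_t minor = static_cast<uint16_t>((v >> 16) & 0xFFFF);
  const uint16_t patch = static_cast<uint16_t>((v >> 32) & 0xFFFF);
  const uint16_t build = static_cast<uint16_t>((v >> 48) & 0xFFFF);

  // Prints "1.8.0" and "1.8.0.0", matching vconone::get_string() and
  // vconone::get_string4() for this default value.
  std::cout << major << "." << minor << "." << patch << std::endl;
  std::cout << major << "." << minor << "." << patch << "." << build << std::endl;
  return 0;
}
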
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-/** +- * @file CLArgOperationKernel.h +- * @brief This file defines CLArgOperationKernel +- * @ingroup COM_AI_RUNTIME +- */ +- +-#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ +-#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to define interface for the argop kernel. +- */ +-class CLArgOperationKernel : public ICLKernel +-{ +-public: +- /** +- * @brief Default constructor. +- */ +- CLArgOperationKernel(); +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers). +- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied +- */ +- CLArgOperationKernel(const CLArgOperationKernel &) = delete; +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers). +- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied +- * @return Reference of this instance +- */ +- CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; +- /** +- * @brief Allow instances of this class to be moved +- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved +- */ +- CLArgOperationKernel(CLArgOperationKernel &&) = default; +- /** +- * @brief Allow instances of this class to be moved +- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved +- * @return Reference of this instance +- */ +- CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; +- /** +- * @brief Initialise the kernel's input, output and border mode. +- * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. +- * @param[out] output The output tensor, Data types supported: S32. +- * @param[in] axis Axis along which to reduce. 
It must be sorted and no duplicates. +- * @param[in] op Arg operation to perform. +- * return N/A +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); +- /** +- * @brief Static function to check if given info will lead to a valid configuration of @ref +- * CLArgOperationKernel +- * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. +- * @param[in] output The output tensor info, Data types supported: S32. +- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. +- * @param[in] op Arg operation to perform. +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, +- ArgOperation op); +- +- /* +- * @brief Run CLArgOperationKernel op +- * @param[in] window Window to be used for in_slice +- * @param[in] queue cl::CommandQueue +- * @return N/A +- */ +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; +- ICLTensor *_output; +- uint32_t _axis; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +deleted file mode 100644 +index b0357fe..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h ++++ /dev/null +@@ -1,121 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-/** +- * @file CLCastKernel.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file defines CLCastKernel class +- */ +- +-#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ +-#define __ARM_COMPUTE_CLCASTKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to define OpenCL kernel for cast operation +- */ +-class CLCastKernel : public ICLKernel +-{ +-public: +- /** +- * @brief Construct CLCastKernel object +- */ +- CLCastKernel(); +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLCastKernel(const CLCastKernel &) = delete; +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLCastKernel &operator=(const CLCastKernel &) = delete; +- +- /** +- * @brief Construct CLCastKernel object using default move constructor +- * @param[in] CLCastKernel object to move +- */ +- CLCastKernel(CLCastKernel &&) = default; +- +- /** +- * @brief Allow instances of this class to be moved +- * @param[in] CLCastKernel object to move +- */ +- CLCastKernel &operator=(CLCastKernel &&) = default; +- +- /** +- * @brief Destruct this CLCastKernel object +- */ +- ~CLCastKernel() = default; +- +- /** +- * @brief Initialise the kernel's input and output. +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] input_subtype Sub data type of input. +- * @return N/A +- */ +- void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +- +- /** +- * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command +- * queue. +- * @note The queue is *not* flushed by this method, and therefore the kernel will not have +- * been executed by the time this method returns. +- * @param[in] window Region on which to execute the kernel. (Must be a valid region of +- * the window returned by window()). +- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A +- * @return N/A +- */ +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; /**< Source tensor */ +- ICLTensor *_output; /**< Destination tensor */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h +deleted file mode 100644 +index 8615cf1..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h ++++ /dev/null +@@ -1,82 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +-#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to perform depthTospace operation */ +-class CLDepthToSpaceKernel : public ICLKernel +-{ +-public: +- /** Default constructor */ +- CLDepthToSpaceKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; +- /** Allow instances of this class to be moved */ +- CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; +- /** Allow instances of this class to be moved */ +- CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; +- /** Default destructor */ +- ~CLDepthToSpaceKernel() = default; +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; /**< Source tensor */ +- ICLTensor *_output; /**< Destination tensor */ +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h +deleted file mode 100644 +index 9321c36..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h ++++ /dev/null +@@ -1,117 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ +-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to multiply matrices +- * +- * @note This kernel should be used ONLY for Midgard architectures +- * +- * This kernel performs the following computation: +- * +- * -# Convert a values from int8 to int32 +- * -# Convert b values from int8 to int32 +- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 +- * +- */ +-class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel +-{ +-public: +- /** Default Constructor */ +- CLGEMMLowpMatrixMultiplyKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; +- /** Initialise the kernel's input and output. +- * +- * @note This kernel should be used ONLY for Midgard architectures +- * +- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 +- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p +- * input0 +- * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type +- * supported: S32 +- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of +- * the input matrices +- */ +- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, +- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLGEMMLowpMatrixMultiplyKernelEx +- * +- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 +- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p +- * input0 +- * @param[in] output Output tensor to store the result of matrix multiplication. Data type +- * supported: S32 +- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of +- * the input matrices +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, +- const ITensorInfo *output, +- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input0; +- const ICLTensor *_input1; +- ICLTensor *_output; +- bool _slide_matrix_b; +- bool _reinterpret_input_as_3d; +- bool _reinterpret_output_as_3d; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h +deleted file mode 100644 +index dd2dbf6..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h ++++ /dev/null +@@ -1,83 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ +-#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to calculate PReLU*/ +-class CLPReLUKernel : public ICLKernel +-{ +-public: +- /** Default constructor */ +- CLPReLUKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers). */ +- CLPReLUKernel(const CLPReLUKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers). */ +- CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; +- /** Allow instances of this class to be moved */ +- CLPReLUKernel(CLPReLUKernel &&) = default; +- /** Allow instances of this class to be moved */ +- CLPReLUKernel &operator=(CLPReLUKernel &&) = default; +- /** Initialize the kernel's input, output. +- * +- * @param[in] input Source tensor1. +- * @param[in] alpha Source tensor2. +- * @param[out] output Output tensor. +- */ +- void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +- BorderSize border_size() const override; +- +-private: +- const ICLTensor *_input; +- const ICLTensor *_alpha; +- ICLTensor *_output; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h +deleted file mode 100644 +index 4c0a82c..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h ++++ /dev/null +@@ -1,82 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +-#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to perform spaceTodepth operation */ +-class CLSpaceToDepthKernel : public ICLKernel +-{ +-public: +- /** Default constructor */ +- CLSpaceToDepthKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; +- /** Allow instances of this class to be moved */ +- CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; +- /** Allow instances of this class to be moved */ +- CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; +- /** Default destructor */ +- ~CLSpaceToDepthKernel() = default; +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; /**< Source tensor */ +- ICLTensor *_output; /**< Destination tensor */ +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h +deleted file mode 100644 +index 9d174de..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h ++++ /dev/null +@@ -1,109 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. +- */ +-class CLTransposeConvLayerUpsampleKernel : public ICLKernel +-{ +-public: +- /** Constructor */ +- CLTransposeConvLayerUpsampleKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsampleKernel & +- operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; +- /** Default Move Constructor. */ +- CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; +- /** Default move assignment operator */ +- CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; +- /** Default destructor */ +- ~CLTransposeConvLayerUpsampleKernel() = default; +- +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. +- * @param[out] output Destination tensor. Data types supported: same as @p input. All but +- * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only +- * performed within the XY-plane. +- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be +- * filled with zero. +- * @param[in] info Contains padding and stride information described in @ref +- * PadStrideInfo. +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, +- const PadStrideInfo &info); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLTransposeConvLayerUpsample +- * +- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. +- * @param[in] output Destination tensor info. Data types supported: same as @p input. All +- * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is +- * only performed within the XY-plane. 
+- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled +- * with zero. +- * @param[in] info Contains padding and stride information described in @ref +- * PadStrideInfo. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- const BorderSize &inner_border, const PadStrideInfo &info); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; +- ICLTensor *_output; +- BorderSize _inner_border; +- PadStrideInfo _info; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h +deleted file mode 100644 +index d4c9c61..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h ++++ /dev/null +@@ -1,88 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +-#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +- +-#include "arm_compute/core/CPP/ICPPKernel.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** CPP kernel to perform tensor upsample. 
+- * +- */ +-class CPPUpsampleKernelEx : public ICPPKernel +-{ +-public: +- const char *name() const override { return "CPPUpsampleKernelEx"; } +- /** Default constructor */ +- CPPUpsampleKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; +- /** Default destructor */ +- ~CPPUpsampleKernelEx() = default; +- +- /** Set the input and output of the kernel. +- * +- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 +- * @param[out] output The output tensor. Data types supported: Same as @p input +- * @param[in] info Padding info. +- */ +- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- bool is_parallelisable() const override; +- +-private: +- const ITensor *_input; +- ITensor *_output; +- PadStrideInfo _info; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h +deleted file mode 100644 +index 4e9f097..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h ++++ /dev/null +@@ -1,96 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ +-#define __ARM_COMPUTE_NECASTKERNEL_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the cast layer kernel. */ +-class NECastKernel : public INEKernel +-{ +-public: +- const char *name() const override { return "NECastKernel"; } +- /** Default constructor */ +- NECastKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NECastKernel(const NECastKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NECastKernel &operator=(const NECastKernel &) = delete; +- /** Default Move Constructor. */ +- NECastKernel(NECastKernel &&) = default; +- /** Default move assignment operator */ +- NECastKernel &operator=(NECastKernel &&) = default; +- /** Default destructor */ +- ~NECastKernel() = default; +- /** Set input, output tensors. +- * +- * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[out] output Destination tensor with the same dimensions of input. Data type supported: +- * U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- */ +- void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); +- /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel +- * +- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +-private: +- const ITensor *_input; +- ITensor *_output; +- SubDataType _input_subtype; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h +deleted file mode 100644 +index b62897e..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h ++++ /dev/null +@@ -1,96 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the depth to space kernel */ +-class NEDepthToSpaceLayerKernelEx : public INEKernel +-{ +-public: +- const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } +- /** Default constructor */ +- NEDepthToSpaceLayerKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; +- /** Default destructor */ +- ~NEDepthToSpaceLayerKernelEx() = default; +- /** Initialise the kernel's inputs and output. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape x value. +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEDepthToSpaceLayerKernelEx. +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape value. 
+- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +-private: +- const ITensor *_input; /**< Source tensor */ +- ITensor *_output; /**< Destination tensor */ +- int32_t _block_shape; /**< Block shape */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h +deleted file mode 100644 +index 57de78d..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h ++++ /dev/null +@@ -1,118 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ +-#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for an element-wise unary operation kernel +- * +- * Element-wise operation is computed by: +- * @f[ output(x) = OP(input(x))@f] +- * +- */ +-class NEElementwiseUnaryKernelEx : public INEKernel +-{ +-public: +- const char *name() const override { return "NEElementwiseUnaryKernelEx"; } +- /** Default constructor */ +- NEElementwiseUnaryKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default; +- /** Default destructor */ +- ~NEElementwiseUnaryKernelEx() = default; +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEElementwiseUnaryKernelEx +- * +- * @param[in] op Arithmetic operation to be executed. +- * @param[in] input First tensor input. Data types supported: F16/F32/S32. +- * @param[in] output Output tensor. Data types supported: Same as @p input. +- */ +- void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output); +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEElementwiseUnaryKernelEx +- * +- * @param[in] op Arithmetic operation to be executed. +- * @param[in] input First tensor input info. Data types supported: F16/F32/S32. +- * @param[in] output Output tensor info. Data types supported: Same as @p input. +- * +- * @return a Status +- */ +- static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input, +- const ITensorInfo *output); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +- /** Common signature for all the specialised arithmetic functions +- * +- * @param[in] input An input tensor. Data types supported: F16/F32/S32. +- * @param[out] output The output tensor. Data types supported: Same as @p input. +- * @param[in] window Region on which to execute the kernel. +- */ +- using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, +- const Window &window); +- +-protected: +- // Inherited methods overridden: +- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); +- +- /** Function to use for the particular tensor types passed to configure() */ +- std::function _function; +- +- const ITensor *_input; +- ITensor *_output; +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h +deleted file mode 100644 +index 722efd3..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h ++++ /dev/null +@@ -1,100 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__ +-#define __ARM_COMPUTE_NEPRELUKERNEL_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the kernel to perform Parametric Rectified Linear Unit +- * +- * Result is computed by: +- * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f] +- */ +-class NEPReLUKernel : public INEKernel +-{ +-public: +- const char *name() const override { return "NEPReLUKernel"; } +- /** Default constructor */ +- NEPReLUKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEPReLUKernel(const NEPReLUKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEPReLUKernel &operator=(const NEPReLUKernel &) = delete; +- /** Allow instances of this class to be moved */ +- NEPReLUKernel(NEPReLUKernel &&) = default; +- /** Allow instances of this class to be moved */ +- NEPReLUKernel &operator=(NEPReLUKernel &&) = default; +- /** Initialise the kernel's inputs and output +- * +- * @param[in] input Input tensor. Data type supported: QASYMM8/F32 +- * @param[in] alpha Alpha tensor. Data types supported: Same as @p input +- * @param[out] output Output tensor. Data types supported: Same as @p input +- */ +- void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEPReLUKernel.h +- * +- * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32. 
+- * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input. +- * @param[in] output Output tensor info. Data types supported: Same as @p input. +- * +- * @return a Status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, +- const ITensorInfo *output); +- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, +- const ITensorInfo &output); +- +-private: +- const ITensor *_input; /**< Source tensor */ +- const ITensor *_alpha; /**< Alpha tensor */ +- ITensor *_output; /**< Destination tensor */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h +deleted file mode 100644 +index 0ffcf6b..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h ++++ /dev/null +@@ -1,97 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +-#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/Types.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the space to depth kernel */ +-class NESpaceToDepthLayerKernelEx : public INEKernel +-{ +-public: +- const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } +- /** Default constructor */ +- NESpaceToDepthLayerKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; +- /** Default destructor */ +- ~NESpaceToDepthLayerKernelEx() = default; +- /** Initialise the kernel's inputs and output. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToDepthLayerKernelEx +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +-private: +- const ITensor *_input; /**< Source tensor */ +- ITensor *_output; /**< Destination tensor */ +- int32_t _block_shape; /**< Block shape */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +index 97bc4ce..cfbd134 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +@@ -16,25 +16,14 @@ + #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ + #define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +-#include +-#include + #include +-#include +-#include + #include + #include + #include + #include + #include +-#include + #include +-#include +-#include + #include +-#include +-#include +-#include +-#include + #include + #include + +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h +deleted file mode 100644 +index c37096f..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h ++++ /dev/null +@@ -1,129 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-/** +- * @file CLArgOperation.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLArgOperation class +- */ +- +-#ifndef __ARM_COMPUTE_CLARGOPERATION_H__ +-#define __ARM_COMPUTE_CLARGOPERATION_H__ +- +-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +-#include "arm_compute/runtime/CL/CLTensor.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to execute CLArgOperation operation +- */ +-class CLArgOperation : public IFunction +-{ +-public: +- /** +- * @brief Construct a new CLArgOperation object +- */ +- CLArgOperation(); +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLArgOperation(const CLArgOperation &) = delete; +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLArgOperation &operator=(const CLArgOperation &) = delete; +- +- /** +- * @brief Construct a new CLArgOperation object by using copy constructor +- * @param[in] CLArgOperation object to move +- */ +- CLArgOperation(CLArgOperation &&) = default; +- +- /** +- * @brief Assign a CLArgOperation object. +- * @param[in] CLArgOperation object to assign. This object will be moved. +- */ +- CLArgOperation &operator=(CLArgOperation &&) = default; +- +- /** +- * @brief Initialise the kernel's inputs and outputs. +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. +- * @param[out] output The result of arg operation. Data types supported: S32. +- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. 
+- * @param[in] op Arg operation to perform. +- * @return N/A +- */ +- void configure(ICLTensor *input, ICLTensor *output, std::vector axis, ArgOperation op); +- +- /** +- * @brief Static function to check if given info will lead to a valid configuration +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. +- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. +- * @param[out] output The result of arg operation. Data types supported: S32. +- * @param[in] op Arg operation to perform. +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const std::vector &axis, +- const ITensorInfo *output, ArgOperation op); +- /** +- * @brief Run the OpenCL kernel for this operation +- * @return N/A +- */ +- void run() override; +- +-private: +- ICLTensor *_input{nullptr}; +- ICLTensor *_output{nullptr}; +- std::vector _axis{}; +- ArgOperation _arg_op{ArgOperation::MAX}; +- +- std::unique_ptr _interm_tensors{nullptr}; +- std::unique_ptr _argop_kernels{nullptr}; +- size_t _num_of_kernels{0}; +-}; +-} +-#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h +deleted file mode 100644 +index eed5cb8..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h ++++ /dev/null +@@ -1,69 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +-#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLBatchToSpaceNDKernel +- * +- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. +- * @note The function converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLBatchToSpaceND : public ICLSimpleFunction +-{ +-public: +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] block_size A pointer to an array of integer values specifying block sizes +- * for spatial dimension. +- */ +- void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +deleted file mode 100644 +index ebe0d8a..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-/** +- * @file CLCast.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLCast class +- */ +- +-#ifndef __ARM_COMPUTE_CLCAST_H__ +-#define __ARM_COMPUTE_CLCAST_H__ +- +-#include "arm_compute/core/TypesEx.h" +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to run @ref CLCastKernel. +- * This converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLCast : public ICLSimpleFunction +-{ +-public: +- /** +- * @brief Initialise the kernel's input and output +- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * The input tensor is [in, out] because its TensorInfo might be +- * modified inside the kernel. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] input_subtype Sub data type of input. +- */ +- void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +-}; +-} +-#endif /* __ARM_COMPUTE_CLCAST_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h +deleted file mode 100644 +index d52a538..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h ++++ /dev/null +@@ -1,68 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +-#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLDepthToSpaceKernel +- * +- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. +- * @note The function converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLDepthToSpace : public ICLSimpleFunction +-{ +-public: +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[block_size] block size integer only +- */ +- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +-}; +-} // namesace arm_compute +- +-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +new file mode 100644 +index 0000000..409eaf5 +--- /dev/null ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +@@ -0,0 +1,201 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/* ++ * Copyright (c) 2019-2020 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ ++#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ ++ ++#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" ++#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" ++#include "arm_compute/runtime/CL/functions/CLReverse.h" ++#include "arm_compute/runtime/CL/functions/CLTranspose.h" ++ ++#include "arm_compute/runtime/CL/CLTensor.h" ++#include "arm_compute/runtime/IFunction.h" ++#include "arm_compute/runtime/IMemoryManager.h" ++#include "arm_compute/runtime/MemoryGroup.h" ++ ++#include <memory> ++ ++namespace arm_compute ++{ ++class ICLTensor; ++/** Function to run the deconvolution layer. ++ * ++ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input ++ * depending on the stride and pad info and then perform a 1x1 ++ * convolution pass. Input stride defines how many zeroes we should put between each element of the ++ * input and pad is the amount of padding. ++ * ++ * The relation between input to output is as follows: ++ * \f[ ++ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x ++ * \f] ++ * \f[ ++ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y ++ * \f] ++ * ++ * where: ++ * width_input is the size of the first input dimension. ++ * height_input is the size of the second input dimension. ++ * width_output is the size of the first output dimension. ++ * height_output is the size of the second output dimension. ++ * kernel_x and kernel_y are the convolution sizes in x and y. ++ * stride_x and stride_y is the input stride of the first and second dimension. ++ * ++ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. ++ * Therefore, it will be necessary to use the weights in the ++ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse. ++ * ++ * This function calls the following OpenCL kernels/functions: ++ * ++ * -# @ref CLDeconvolutionLayerUpsample ++ * -# @ref CLConvolutionLayer ++ * ++ * And the following CPP kernels: ++ * -# @ref CLReverse ++ * ++ */ ++class CLDirectTransposeConvLayer : public IFunction ++{ ++public: ++ /** Constructor */ ++ CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete; ++ /** Default move constructor */ ++ CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete; ++ /** Default move assignment operator */ ++ CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; ++ /** Set the input, weights, biases and output tensors. ++ * ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. ++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type ++ * supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension.
++ * Data type supported: Should match @p input data type, except for ++ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type ++ * @param[out] output Output tensor. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] info Contains padding and policies to be used in the deconvolution, this ++ * is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. ++ * ++ */ ++ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info = WeightsInfo()); ++ /** Set the input, weights, biases and output tensors. ++ * ++ * @param[in] compile_context The compile context to be used. ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and ++ * an optional 4th dimension for batch of inputs. ++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. ++ * Data type supported: Should match @p input data type, except for ++ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type ++ * @param[out] output Output tensor. The output has the same number of dimensions as ++ * the @p input. ++ * @param[in] info Contains padding and policies to be used in the deconvolution, ++ * this is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref ++ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref ++ * CLWeightsReshapeKernel. ++ * ++ */ ++ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, ++ unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info = WeightsInfo()); ++ /** Static function to check if given info will lead to a valid configuration of @ref ++ * CLDirectTransposeConvLayer ++ * ++ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. ++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. ++ * Data type supported: Should match @p input data type, except for input ++ * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type ++ * @param[in] output Output tensor info. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] info Contains padding and policies to be used in the deconvolution, this is ++ * described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, ++ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, ++ unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info = WeightsInfo()); ++ ++ // Inherited methods overridden: ++ void run() override; ++ void prepare() override; ++ ++private: ++ MemoryGroup _memory_group; ++ CLDeconvolutionLayerUpsample _scale_f; ++ CLConvolutionLayer _conv_f; ++ CLReverse _flip_weights; ++ ++ CLTensor _scaled_output; ++ ICLTensor *_original_weights; ++ CLTensor _weights_flipped; ++ CLTensor _flip_axis; ++ ++ bool _is_prepared; ++}; ++} // namespace arm_compute ++#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +index 1a0284a..f3266f6 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +@@ -50,7 +50,7 @@ + #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" + #include "arm_compute/runtime/MemoryGroup.h" + #include "arm_compute/runtime/CL/CLTensor.h" +-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" ++#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" + + namespace arm_compute + { +@@ -168,7 +168,7 @@ private: + CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; + CLScaleFactorSymm8Kernel _scale_factor_kernel; + CLQuantizationSymmetricKernel _quant_input_kernel; +- CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; ++ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLMultiplyScaleFactorKernel _multiply_scale_kernel; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to + // add bias in +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h +deleted file mode 100644 +index 68aba74..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h ++++ /dev/null +@@ -1,142 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
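The output-size relation quoted in the CLDirectTransposeConvLayer documentation above is easy to sanity-check outside the library. Below is a minimal standalone C++ sketch of just that formula; the helper name and the example numbers are illustrative and not part of the patch.

#include <cstdio>

// width_output = (width_input - 1) * stride_x - 2 * padding_x + kernel_x
// (the same relation holds for the height/y dimension).
static int transpose_conv_out_dim(int in_dim, int kernel, int stride, int padding)
{
  return (in_dim - 1) * stride - 2 * padding + kernel;
}

int main()
{
  // Example: a 14x14 input with a 3x3 kernel, stride 2 and padding 1
  // gives a 27x27 output: (14 - 1) * 2 - 2 * 1 + 3 = 27.
  std::printf("%d\n", transpose_conv_out_dim(14, 3, 2, 1));
  return 0;
}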
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +- +-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" +-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" +-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +-#include "arm_compute/runtime/CL/CLTensor.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/runtime/MemoryGroup.h" +- +-namespace arm_compute +-{ +-class IMemoryManager; +-class ICLTensor; +- +-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the +- * following OpenCL kernels: +- * +- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of +- * GEMMInfo is FALSE) +- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) +- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) +- * +-*/ +-class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction +-{ +-public: +- /** Constructor */ +- CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move constructor */ +- CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move assignment operator */ +- CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Initialise the kernel's inputs, output +- * +- * @note GEMMLowp: low precision GEMM kernel. [A * B + C] +- * This kernel performs the following computations: +- * +- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. +- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them. +- * -# Compute the matrix product of the resulting a * b in int32. 
+- * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE +- * +- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. +- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: +- * S32 +- * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if +- * gemm_info.gemmlowp_output_stage != NONE +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- */ +- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, +- const GEMMInfo &gemm_info = GEMMInfo()); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLGEMMLowpMatrixMultiplyCoreEx +- * +- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. +- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type +- * supported: S32 +- * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if +- * gemm_info.gemmlowp_output_stage != NONE +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, +- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); +- +- // Inherited methods overridden: +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- +- // Kernels used +- CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel; +- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; +- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; +- +- // Temporary tensors +- CLTensor _vector_sum_col; +- CLTensor _vector_sum_row; +- +- int32_t _a_offset; +- int32_t _b_offset; +- bool _reshape_b_only_on_first_run; +- bool _is_prepared; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h +deleted file mode 100644 +index 5121671..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
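The comment in the removed CLGEMMLowpMatrixMultiplyCoreEx header above describes the low-precision GEMM as: add a_offset to the QASYMM8 values of A, add b_offset to the values of B, and accumulate the products in int32. The following is a plain reference sketch of that accumulation only, not the OpenCL implementation; the function name and the row-major layout are assumptions made for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

// Offsets are added to the raw uint8 values and the products are accumulated in int32.
static std::vector<int32_t> gemmlowp_ref(const std::vector<uint8_t> &a, // M x K, row-major
                                         const std::vector<uint8_t> &b, // K x N, row-major
                                         int m, int k, int n, int32_t a_offset, int32_t b_offset)
{
  std::vector<int32_t> out(m * n, 0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        out[i * n + j] += (static_cast<int32_t>(a[i * k + p]) + a_offset) *
                          (static_cast<int32_t>(b[p * n + j]) + b_offset);
  return out;
}

int main()
{
  // 1x2 times 2x1 with both offsets at -128: (130-128)*(129-128) + (126-128)*(127-128) = 4
  std::printf("%d\n", gemmlowp_ref({130, 126}, {129, 127}, 1, 2, 1, -128, -128)[0]);
  return 0;
}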
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ +-#define __ARM_COMPUTE_CLLOGICALNOT_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-class CLLogicalNot : public ICLSimpleFunction +-{ +-public: +- /** Initialise the function's source and destination. +- * +- * @param[in] input Source tensor. Data types supported: QASYMM8. +- * @param[out] output Output tensor. Data types supported: QASYMM8. +- */ +- void configure(ICLTensor *input, ICLTensor *output); +-}; +- +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h +deleted file mode 100644 +index 7fbe558..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h ++++ /dev/null +@@ -1,64 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLPRELU_H__ +-#define __ARM_COMPUTE_CLPRELU_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-class CLPReLU : public ICLSimpleFunction +-{ +-public: +- /** Initialise the function's source and destination. +- * +- * @param[in] input. Data types supported: +- * QASYMM8/F16/F32. +- * @param[in] alpha. Data types supported: +- * QASYMM8/F16/F32. +- * @param[out] output Output tensor. Data types supported: Same as @p input. +- */ +- void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLPRELU_H__*/ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +deleted file mode 100644 +index e83fb01..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h ++++ /dev/null +@@ -1,103 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
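The removed CLPReLU above takes a per-element alpha tensor alongside the input. For reference, the element-wise rule it applies is the standard parametric ReLU; the helper below is a sketch of that rule only, not the CL kernel, and its name is illustrative.

// prelu_ref(3.0f, 0.25f) == 3.0f, prelu_ref(-2.0f, 0.25f) == -0.5f
static inline float prelu_ref(float x, float alpha) { return x >= 0.0f ? x : alpha * x; }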
+- */ +- +-/** +- * @file CLPixelWiseDivision.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLPixelWiseDivision class +- */ +-#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +-#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to run @ref CLPixelWiseDivisionKernel. +- */ +-class CLPixelWiseDivision : public ICLSimpleFunction +-{ +-public: +- /** +- * @brief Initialise the kernel's inputs, output and convertion policy. +- * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 +- * The input tensor is [in, out] because its TensorInfo might be +- * modified inside the kernel in case of broadcasting of dimension 0. +- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. +- * The input tensor is [in, out] because its TensorInfo might be +- * modified inside the kernel in case of broadcasting of dimension 0. +- * @param[out] output The output tensor, Data types supported: same as @p input1. +- * Note: U8 requires both inputs to be U8. +- * @param[in] scale Scale to apply after multiplication. +- * Scale must be positive and its value must be either 1/255 or +- * 1/2^n where n is between 0 and 15. +- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest +- * even. +- * @return N/A +- */ +- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, +- ConvertPolicy overflow_policy = ConvertPolicy::WRAP, +- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +- +- /** +- * @brief Static function to check if given info will lead to a valid configuration of @ref +- * CLPixelWiseDivision +- * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 +- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. +- * @param[in] output The output tensor info, Data types supported: same as @p input1. +- * Note: U8 requires both inputs to be U8. +- * @param[in] scale Scale to apply after multiplication. +- * Scale must be positive and its value must be either 1/255 or 1/2^n +- * where n is between 0 and 15. +- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. +- * @return a status +- */ +- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, +- const ITensorInfo *output, float scale = 1.f, +- ConvertPolicy overflow_policy = ConvertPolicy::WRAP, +- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +-}; +-} +-#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h +deleted file mode 100644 +index b49cbd8..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h ++++ /dev/null +@@ -1,120 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
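The scale argument of the removed CLPixelWiseDivision above carries an unusual restriction: it must be positive and equal to either 1/255 or 1/2^n with n between 0 and 15. A small standalone checker for that constraint is sketched below; the function name is illustrative, and the loop relies on powers of two comparing exactly in IEEE floats.

#include <cstdio>

// Accepts only the scales allowed by the comment above.
static bool is_valid_division_scale(float scale)
{
  if (scale == 1.0f / 255.0f)
    return true;
  float candidate = 1.0f; // 1/2^0
  for (int n = 0; n <= 15; ++n, candidate /= 2.0f)
    if (scale == candidate)
      return true;
  return false;
}

int main()
{
  // 1/255 and 1/4 are accepted, 0.3 is not: prints "1 1 0".
  std::printf("%d %d %d\n", is_valid_division_scale(1.0f / 255.0f),
              is_valid_division_scale(0.25f), is_valid_division_scale(0.3f));
  return 0;
}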
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +-#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +- +-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +-#include "arm_compute/core/CL/kernels/CLCopyKernel.h" +-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +-#include "arm_compute/runtime/CL/functions/CLGEMM.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLRNNLayerEx */ +-class CLRNNLayerEx : public IFunction +-{ +-public: +- /** Default constructor */ +- CLRNNLayerEx(std::shared_ptr memory_manager = nullptr); +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that +- * multiplies the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies +- * the current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same +- * as @p input +- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. +- */ +- void configure(const ICLTensor *input, const ICLTensor *weights, +- const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, +- ICLTensor *output, ActivationLayerInfo &info); +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies +- * the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the +- * current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p +- * input +- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info); +- +- // Inherited methods overridden: +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- CLGEMM _gemm_state_f; +- CLSaturatedArithmeticOperationKernel _add_kernel; +- CLActivationLayerKernel _activation_kernel; +- CLFullyConnectedLayer _fully_connected_kernel; +- CLCopyKernel _copy_kernel; +- CLTensor _fully_connected_out; +- CLTensor _gemm_output; +- CLTensor _add_output; +- bool _is_prepared; +-}; +-} +-#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h +deleted file mode 100644 +index 2090b46..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h ++++ /dev/null +@@ -1,68 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
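The member list of the removed CLRNNLayerEx above (a fully connected pass on the input, a GEMM on the previous state, an add, an activation and a copy back to hidden_state) amounts to a plain vanilla-RNN step. The reference sketch below spells that step out; the row-major layout and the fixed tanh activation are assumptions made only for illustration.

#include <cmath>
#include <cstddef>
#include <vector>

// hidden_state and output both receive
// activation(weights * input + recurrent_weights * hidden_state + bias).
static void rnn_step_ref(const std::vector<float> &input,             // [input_size]
                         const std::vector<float> &weights,           // [input_size x num_units]
                         const std::vector<float> &recurrent_weights, // [num_units x num_units]
                         const std::vector<float> &bias,              // [num_units]
                         std::vector<float> &hidden_state,            // [num_units], updated in place
                         std::vector<float> &output)                  // [num_units]
{
  const std::size_t num_units = bias.size();
  const std::size_t input_size = input.size();
  std::vector<float> next(num_units);
  for (std::size_t u = 0; u < num_units; ++u)
  {
    float acc = bias[u];
    for (std::size_t i = 0; i < input_size; ++i)
      acc += weights[i * num_units + u] * input[i];
    for (std::size_t j = 0; j < num_units; ++j)
      acc += recurrent_weights[j * num_units + u] * hidden_state[j];
    next[u] = std::tanh(acc); // stands in for the ActivationLayerInfo parameter
  }
  hidden_state = next;
  output = next;
}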
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ +-#define __ARM_COMPUTE_CLSPACETODEPTH_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLSpaceToDepthKernel +- * +- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. +- * @note The function converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLSpaceToDepth : public ICLSimpleFunction +-{ +-public: +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[block_size] block size integer only +- */ +- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h +deleted file mode 100644 +index 03edd15..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h ++++ /dev/null +@@ -1,81 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-/** +- * @file CLStridedSlice.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class +- */ +- +-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +-#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to run @ref CLStridedSliceKernel +- */ +-class CLStridedSliceEx : public ICLSimpleFunction +-{ +-public: +- /** +- * @brief Initialise the kernel's inputs and outputs +- * @param[in] input Tensor input. Data type supported: +- * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 +- * @param[out] output Output tensor. Data type supported: Same as @p input +- * @param[in] beginData 'begin' vector of strided slice operation +- * @param[in] endData 'end' vector of strided slice operation +- * @param[in] stridesData 'strides' vector of strided slice operation +- * @param[in] beginMask If the ith bit is set, begin[i] is ignored +- * @param[in] endMask If the ith bit is set, end[i] is ignored +- * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the +- * dimensionality by 1, taking on the value at index begin[i] +- * @return N/A +- */ +- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, +- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, +- int32_t shrinkAxisMask); +-}; +-} +-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +index 54a697e..5fb102e 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2018 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,16 +37,11 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + #define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + +-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" +- +-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" +- +-#include "arm_compute/runtime/CL/CLTensor.h" ++#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" ++#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" + #include "arm_compute/runtime/IFunction.h" + #include "arm_compute/runtime/IMemoryManager.h" + +@@ -54,119 +49,102 @@ + + namespace arm_compute + { +-class ICLTensor; +-/** Function to run the transpose convolution layer. +- * +- * @note This layer was copied in order to fix a bug computing to wrong output dimensions. +- * +- * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input +- * depending on the stride and pad info and then perform a 1x1 +- * convolution pass. Input stride defines how many zeroes we should put between each element of the +- * input, pad is the amount of padding and finally a is a user +- * specified value where a < stride - 1, that increases the padding top and right of the input +- * image. 
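The begin/end/shrink-axis masks documented above for the removed CLStridedSliceEx are ordinary bitfields indexed by dimension. The sketch below only illustrates how such a mask is read; the names are made up, and "take the widest range" paraphrases the comment's "if the ith bit is set, begin[i]/end[i] is ignored".

#include <cstdint>
#include <cstdio>

// Bit i of begin_mask means "ignore beginData[i] and take the widest range for
// dimension i"; end_mask and shrink_axis_mask are read the same way.
static bool mask_bit_set(int32_t mask, int dim) { return ((mask >> dim) & 1) != 0; }

int main()
{
  const int32_t begin_mask = 0b101; // ignore the 'begin' values of dimensions 0 and 2
  for (int dim = 0; dim < 3; ++dim)
    std::printf("dim %d: begin %s\n", dim, mask_bit_set(begin_mask, dim) ? "ignored" : "used");
  return 0;
}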
+- * +- * The relation between input to output is as follows: +- * \f[ +- * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x +- * \f] +- * \f[ +- * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y +- * \f] +- * +- * where: +- * width_input is the size of the first input dimension. +- * height_input is the size of the second input dimension. +- * width_output is the size of the first output dimension. +- * height_output is the size of the second output dimension. +- * kernel_x and kernel_y are the convolution sizes in x and y. +- * stride_x and stride_y is the input stride of the first and second dimension. +- * +- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. +- * Therefore, it will be necessary to use the weights in the +- * reverse order to perform an actual convolution. This is achieved by using the @ref +- * CPPFlipWeightsKernel. +- * +- * This function calls the following OpenCL kernels/functions: +- * +- * -# @ref CLTransposeConvLayerUpsample +- * -# @ref CLConvolutionLayer ++/** Basic function to compute the deconvolution layer. This function calls the following OpenCL ++ * kernels/functions: + * ++ * -# @ref CLGEMMDeconvolutionLayer ++ * -# @ref CLDirectTransposeConvLayer + */ + class CLTransposeConvLayer : public IFunction + { + public: +- /** Constructor */ ++ /** Default constructor */ + CLTransposeConvLayer(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayer(const CLTransposeConvLayer &) = delete; +- /** Default move constructor */ +- CLTransposeConvLayer(CLTransposeConvLayer &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete; +- /** Default move assignment operator */ +- CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default; ++ + /** Set the input, weights, biases and output tensors. + * +- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, +- * and an optional 4th dimension for batch of inputs. +- * Data types supported: QASYMM8/F16/F32. +- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. +- * Data type supported: Same as @p input. +- * @param[in] bias (Optional) The biases have one dimension. Data type supported: +- * Same as @p input. +- * @param[out] output Output tensor. The output has the same number of dimensions +- * as the @p input. +- * @param[in] info Contains padding and policies to be used in the +- * transpose convolution, this is decribed in @ref PadStrideInfo. +- * @param[in] invalid_right The number of zeros added to right edge of the output. +- * @param[in] invalid_bottom The number of zeros added to top edge of the output. +- * @param[in] weights_info (Optional) Weights information needed for @ref +- * CLConvolutionLayer, specifies if the weights tensor has been +- * reshaped with @ref CLWeightsReshapeKernel. ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type ++ * supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same ++ * as @p input. 
++ * @param[out] output Output tensor. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this ++ * is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. ++ * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, +- const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); ++ /** Set the input, weights, biases and output tensors. ++ * ++ * @param[in] compile_context The compile context to be used. ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and ++ * an optional 4th dimension for batch of inputs. Data types supported: ++ * QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: ++ * Same as @p input. ++ * @param[out] output Output tensor. The output has the same number of dimensions as ++ * the @p input. ++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, ++ * this is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref ++ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref ++ * CLWeightsReshapeKernel. ++ * ++ */ ++ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, ++ unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref +- * CLTransposeConvLayer ++ * CLTransposeConvLayer ++ * ++ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as ++ * @p input. ++ * @param[in] output Output tensor info. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is ++ * described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * +- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, +- * and an optional 4th dimension for batch of inputs. +- * Data types supported: QASYMM8/F16/F32. +- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. +- * Data type supported: Same as @p input. +- * @param[in] bias (Optional) The biases have one dimension. Data type supported: +- * Same as @p input. +- * @param[in] output Output tensor info. The output has the same number of dimensions +- * as the @p input. +- * @param[in] info Contains padding and policies to be used in the +- * transpose convolution, this is decribed in @ref PadStrideInfo. +- * @param[in] innvalid_right The number of zeros added to right edge of the output. +- * @param[in] invalid_bottom The number of zeros added to top edge of the output. +- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, +- * specifies if the weights tensor has been reshaped with @ref +- * CLWeightsReshapeKernel. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, +- unsigned int innvalid_right, unsigned int invalid_bottom, ++ const ITensorInfo *bias, ITensorInfo *output, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + ++ static DeconvolutionMethod ++ get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, ++ const ITensorInfo *bias, ITensorInfo *output, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info); + // Inherited methods overridden: + void run() override; + void prepare() override; + + private: +- MemoryGroup _memory_group; +- CLTransposeConvLayerUpsample _scale_f; +- CLConvolutionLayer _conv_f; +- CPPFlipWeightsKernel _flip_weights; +- CLTensor _scaled_output; +- ICLTensor *_original_weights; +- CLTensor _weights_flipped; +- bool _is_prepared; ++ std::shared_ptr<IMemoryManager> _memory_manager; ++ std::unique_ptr<IFunction> _function; + }; +-} ++} // namespace arm_compute + #endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h +deleted file mode 100644 +index 7570fe7..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h ++++ /dev/null +@@ -1,102 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2018 ARM Limited.
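With the rewrite above, CLTransposeConvLayer no longer runs the upsample-plus-convolution pipeline itself: get_deconvolution_method() picks a DeconvolutionMethod at configure() time, and the layer then owns whichever implementation was selected through a std::unique_ptr to IFunction. The sketch below shows only that ownership/dispatch shape; the interface, the two implementations and the enum are stand-ins, not the ACL classes.

#include <memory>

struct IRunnable
{
  virtual ~IRunnable() = default;
  virtual void run() = 0;
};

struct DirectPath : IRunnable
{
  void run() override { /* direct path: upsample + convolution */ }
};

struct GemmPath : IRunnable
{
  void run() override { /* GEMM-based deconvolution path */ }
};

enum class Method { Direct, Gemm };

class TransposeConvDispatcher
{
public:
  void configure(Method m)
  {
    // Decide once, then delegate every run() to the chosen implementation.
    _function = (m == Method::Direct) ? std::unique_ptr<IRunnable>(new DirectPath())
                                      : std::unique_ptr<IRunnable>(new GemmPath());
  }
  void run() { _function->run(); }

private:
  std::unique_ptr<IRunnable> _function;
};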
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ +-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/runtime/IMemoryManager.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */ +-class CLTransposeConvLayerUpsample : public IFunction +-{ +-public: +- /** Default constructor */ +- CLTransposeConvLayerUpsample(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete; +- /** Allow instances of this class to be moved */ +- CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default; +- /** Allow instances of this class to be moved */ +- CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default; +- /** Default destructor */ +- virtual ~CLTransposeConvLayerUpsample() = default; +- +- /** Initialize the function's source, destination, interpolation type and border_mode. +- * +- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. +- * @param[out] output Destination tensor. Data type supported: same as @p input. +- * @param[in] inner_border The number of zeros added to right and top edges of the input. +- * @param[in] info Contains padding and policies to be used in the deconvolution. +- */ +- void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, +- const PadStrideInfo &info); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLTransposeConvLayerUpsample +- * +- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. +- * @param[in] output Destination tensor info. Data type supported: same as @p input. +- * @param[in] inner_border The number of zeros added to right and top edges of the input. +- * @param[in] info Contains padding and policies to be used in the deconvolution. 
+- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- const BorderSize &inner_border, const PadStrideInfo &info); +- +- // Inherited methods overridden: +- void run() override; +- +-private: +- CLTransposeConvLayerUpsampleKernel _upsample; +- ICLTensor *_output; +-}; +-} +-#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h +deleted file mode 100644 +index 666afef..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +-#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +- +-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" +- +-#include "arm_compute/core/Types.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to run @ref CPPUpsample */ +-class CPPUpsampleEx : public ICPPSimpleFunction +-{ +-public: +- /** Configure the upsample CPP kernel +- * +- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 +- * @param[out] output The output tensor. 
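The removed CLTransposeConvLayerUpsample above (and the CPP upsample being removed next) performs the zero-insertion step of the deconvolution: stride - 1 zeros are placed between neighbouring input elements before the convolution pass. A 1-D reference sketch of that expansion, with an illustrative name, follows.

#include <cstddef>
#include <vector>

// upsample_zero_insert_1d({1, 2, 3}, 2) yields {1, 0, 2, 0, 3};
// a 2-D upsample applies the same expansion along both spatial axes.
static std::vector<float> upsample_zero_insert_1d(const std::vector<float> &in, int stride)
{
  if (in.empty())
    return {};
  std::vector<float> out((in.size() - 1) * stride + 1, 0.0f);
  for (std::size_t i = 0; i < in.size(); ++i)
    out[i * stride] = in[i];
  return out;
}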
Data types supported: Same as @p input +- * @param[in] info Padding information +- */ +- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +-}; +-} +-#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +index 49504fd..3fad230 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +@@ -18,20 +18,13 @@ + + #include + #include +-#include +-#include + #include + #include + #include + #include + #include +-#include +-#include + #include +-#include + #include +-#include +-#include + #include + + #endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h +deleted file mode 100644 +index f0f0d81..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h ++++ /dev/null +@@ -1,79 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NECAST_H__ +-#define __ARM_COMPUTE_NECAST_H__ +- +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-// Forward declarations +-class ITensor; +- +-/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ +-class NECast : public INESimpleFunctionNoBorder +-{ +-public: +- /** Configure the kernel. +- * +- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. 
+- * @param[out] output Destination tensor with the same dimensions of input. Data type supported: +- * U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- */ +- void configure(const ITensor *input, ITensor *output, +- SubDataType input_subtype = SubDataType::NONE); +- /** Static function to check if given info will lead to a valid configuration of @ref NECast +- * +- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype = SubDataType::NONE); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NECAST_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h +deleted file mode 100644 +index 005d85a..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h ++++ /dev/null +@@ -1,78 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. 
*/ +-class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder +-{ +-public: +- /** Set the input and output tensors. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape value. +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEDepthToSpaceLayerEx. +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape x value. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h +deleted file mode 100644 +index 27a38e9..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h ++++ /dev/null +@@ -1,70 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +-#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +- +-#include "arm_compute/runtime/NEON/INESimpleFunction.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to perform negative on an input tensor. */ +-class NENegLayer : public INESimpleFunction +-{ +-public: +- /** Initialize the function +- * +- * @param[in] input Input tensor. Data types supported: F16/F32/S32. +- * @param[out] output Output tensor. Data types supported: same as @p input. +- */ +- void configure(const ITensor *input, ITensor *output); +- /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer +- * +- * @param[in] input First tensor input info. Data types supported: F16/F32/S32. +- * @param[in] output Output tensor info. Data types supported: Same as @p input. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +index 39c57eb..56548a4 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +@@ -46,7 +46,7 @@ + #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + #include "arm_compute/core/NEON/kernels/NETransposeKernel.h" + #include "arm_compute/runtime/MemoryGroup.h" +-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" ++#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" + #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + #include "arm_compute/runtime/Tensor.h" + +@@ -164,7 +164,7 @@ private: + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; +- NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; ++ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h +deleted file mode 100644 +index d844513..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h ++++ /dev/null +@@ -1,170 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +-#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/runtime/IMemoryManager.h" +-#include "arm_compute/runtime/MemoryGroup.h" +-// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +-#include "arm_compute/runtime/Tensor.h" +- +-#include +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following +- * NEON kernels if the DOT product instruction is not available: +- * +- * -# @ref NEGEMMInterleave4x4Kernel +- * -# @ref NEGEMMTranspose1xWKernel +- * -# @ref NEGEMMLowpMatrixMultiplyKernel +- * -# @ref NEGEMMLowpOffsetContributionKernel +- * -# @ref NEActivationLayer +- * +- * otherwise if the DOT product instruction is available: +- * +- * -# @ref NEGEMMLowpOffsetContributionKernel +- * +-*/ +-class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction +-{ +-public: +- /** Constructor */ +- NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move constructor */ +- NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move assignment operator */ +- NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Initialise the kernel's inputs, output +- * +- * @note GEMM_LOWP: low precision GEMM kernel +- * This kernel performs the following computations: +- * +- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. 
+- * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. +- * -# Compute the matrix product of the resulting a * b in int32. +- * +- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is +- * QASYMM8/QASYMM8_SIGNED otherwise +- * +- * @param[in] a First input tensor (Matrix A). Data type supported: +- * QASYMM8/QASYMM8_SIGNED. +- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: +- * S32 +- * @param[out] output Output tensor. Data type supported: Data type supported: +- * S32/QASYMM8/QASYMM8_SIGNED +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- */ +- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, +- const GEMMInfo &gemm_info = GEMMInfo()); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEGEMMLowpMatrixMultiplyCoreEx +- * +- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is +- * QASYMM8/QASYMM8_SIGNED otherwise +- * +- * @param[in] a First input tensor info (Matrix A). Data type supported: +- * QASYMM8/QASYMM8_SIGNED. +- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type +- * supported: S32 +- * @param[in] output Output tensor info. Data type supported: Data type supported: +- * S32/QASYMM8/QASYMM8_SIGNED +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, +- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); +- +- // Inherited methods overridden +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- NEGEMMAssemblyDispatch _asm_glue; +- std::unique_ptr _mm_kernel; +- std::unique_ptr _mtx_a_reshape_kernel; +- std::unique_ptr _mtx_b_reshape_kernel; +- NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; +- NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; +- NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; +- NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; +- +- Tensor _vector_sum_col; +- Tensor _vector_sum_row; +- Tensor _tmp_a; +- Tensor _tmp_b; +- Tensor _mm_result_s32; +- Tensor _signed_a; +- Tensor _signed_output; +- const ITensor *_original_b; +- int32_t _a_offset; +- int32_t _b_offset; +- +- bool _run_vector_matrix_multiplication; +- bool _assembly_path; +- bool _fused_assembly_path; +- bool _reshape_b_only_on_first_run; +- bool _is_prepared; +- bool _fuse_output_stage; +- bool _flip_signedness; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h +deleted file mode 100644 +index ca84133..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h ++++ /dev/null +@@ -1,63 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEPRELU_H__ +-#define __ARM_COMPUTE_NEPRELU_H__ +- +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to run @ref NEPReLUKernel */ +-class NEPReLU : public INESimpleFunctionNoBorder +-{ +-public: +- /** Initialise the kernel's inputs and output +- * +- * @param[in] input. Data types supported: QASYMM8/F32. +- * @param[in] alpha. Data types supported: Same as @p input. +- * @param[out] output Output tensor. Data types supported: Same as @p input. +- */ +- void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEPRELU_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h +deleted file mode 100644 +index 8a7b179..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h ++++ /dev/null +@@ -1,130 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
+- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ +-#define __ARM_COMPUTE_NERNNLAYER_EX_H__ +- +-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +-#include "arm_compute/core/NEON/kernels/NECopyKernel.h" +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +-#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +- +-namespace arm_compute +-{ +-// Forward declarations +-class ITensor; +- +-/** Basic function to run @ref NERNNLayerEx */ +-class NERNNLayerEx : public IFunction +-{ +-public: +- /** Default constructor */ +- NERNNLayerEx(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NERNNLayerEx(const NERNNLayerEx &) = delete; +- /** Default move constructor */ +- NERNNLayerEx(NERNNLayerEx &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; +- /** Default move assignment operator */ +- NERNNLayerEx &operator=(NERNNLayerEx &&) = default; +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that +- * multiplies the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies +- * the current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same +- * as @p input +- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. 
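(For readers skimming the deleted NERNNLayerEx header above: per its doxygen, the layer computes one vanilla RNN step, hidden_state = activation(weights * input + recurrent_weights * hidden_state + bias), with the tensor shapes listed in the parameter block. Below is a minimal plain-C++ sketch of that step, not the NEON implementation; batch size 1, a fixed tanh activation, and the row-major weight layout are assumptions of the sketch, and the function name is illustrative.)

#include <cmath>
#include <cstddef>
#include <vector>

// One RNN step: out[u] = tanh(b[u] + sum_i W[u][i]*x[i] + sum_j R[u][j]*h[j])
// x: input_size, h: num_units, W: num_units*input_size, R: num_units*num_units, b: num_units.
std::vector<float> rnn_step(const std::vector<float> &x, const std::vector<float> &h,
                            const std::vector<float> &W, const std::vector<float> &R,
                            const std::vector<float> &b)
{
  const std::size_t input_size = x.size();
  const std::size_t num_units = h.size();
  std::vector<float> out(num_units, 0.0f);
  for (std::size_t u = 0; u < num_units; ++u)
  {
    float acc = b[u];
    for (std::size_t i = 0; i < input_size; ++i)
      acc += W[u * input_size + i] * x[i]; // fully connected part (weights * input)
    for (std::size_t j = 0; j < num_units; ++j)
      acc += R[u * num_units + j] * h[j]; // recurrent part (recurrent_weights * hidden_state)
    out[u] = std::tanh(acc); // stands in for the ActivationLayerInfo passed to configure()
  }
  return out;
}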
+- */ +- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, +- const ITensor *bias, ITensor *hidden_state, ITensor *output, +- ActivationLayerInfo &info); +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies +- * the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the +- * current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p +- * input +- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info); +- +- // Inherited methods overridden: +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- NEGEMM _gemm_state_f; +- NEArithmeticAdditionKernel _add_kernel; +- NEActivationLayerKernel _activation_kernel; +- NEFullyConnectedLayer _fully_connected_kernel; +- NECopyKernel _copy_kernel; +- Tensor _fully_connected_out; +- Tensor _gemm_output; +- Tensor _add_output; +- bool _is_prepared; +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h +deleted file mode 100644 +index 03ac457..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h ++++ /dev/null +@@ -1,99 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +-#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/MemoryGroup.h" +-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to perform reduce operation */ +-class NEReduceMeanEx : public IFunction +-{ +-public: +- /** Constructor */ +- NEReduceMeanEx(std::shared_ptr memory_manager = nullptr); +- /** Configure kernel +- * +- * @note Supported tensor rank: up to 4 +- * +- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 +- * @param[in] reduction_axis Reduction axis vector. +- * @param[in] keep_dims If positive, retains reduced dimensions with length 1. +- * @param[out] output Destination tensor. Data type supported: Same as @p input +- */ +- void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, +- ITensor *output); +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEReduceMeanEx +- * +- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 +- * @param[in] reduction_axis Reduction axis vector. +- * @param[in] keep_dims If positive, retains reduced dimensions with length 1. +- * @param[in] output Destination tensor. Data type supported: Same as @p input +- * +- * @return A status +- */ +- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, +- bool keep_dims, const ITensorInfo *output); +- +- // Inherited methods overridden: +- void run() override; +- +-private: +- MemoryGroup _memory_group; +- std::unique_ptr _reduction_kernels{nullptr}; +- std::unique_ptr _reduced_outs{nullptr}; +- NEReshapeLayer _reshape; +- unsigned int _reduction_ops; +- bool _keep_dims; +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h +deleted file mode 100644 +index 3b695fb..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h ++++ /dev/null +@@ -1,136 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +-#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" +-#include "arm_compute/core/Types.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to spatial divide a tensor. This function calls the following NEON +- * kernels/functions: +- * +- * -# @ref NEMemsetKernel +- * -# @ref NESpaceToBatchLayerKernel +- */ +-class NESpaceToBatchLayerEx : public IFunction +-{ +-public: +- /** Default constructor */ +- NESpaceToBatchLayerEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; +- /** Allow instances of this class to be moved */ +- NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; +- /** Allow instances of this class to be moved */ +- NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; +- /** Default destructor */ +- virtual ~NESpaceToBatchLayerEx() = default; +- /** Set the input and output tensors. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 +- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 +- * @param[out] output Tensor output. Data types supported: same as @p input +- */ +- void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, +- ITensor *output); +- /** Set the input and output tensors. (Static block shape and paddings) +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape_x Block shape x value. +- * @param[in] block_shape_y Block shape y value. 
+- * @param[in] padding_left The left padding of the output tensor. +- * @param[in] padding_right The right padding of the output tensor. +- * @param[out] output Tensor output. Data types supported: same as @p input +- */ +- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, +- const Size2D &padding_left, const Size2D &padding_right, ITensor *output); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToBatchLayerEx +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 +- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, +- const ITensorInfo *paddings, const ITensorInfo *output); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToBatchLayerEx (Static block shape and paddings) +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape_x Block shape x value. +- * @param[in] block_shape_y Block shape y value. +- * @param[in] padding_left The left padding of the output tensor. +- * @param[in] padding_right The right padding of the output tensor. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, +- const Size2D &padding_left, const Size2D &padding_right, +- const ITensorInfo *output); +- +- // Inherited methods overridden: +- void run() override; +- +-private: +- NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ +- NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ +- bool _has_padding; /**< Flag to check if the output has padding */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h +deleted file mode 100644 +index 9f32616..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h ++++ /dev/null +@@ -1,79 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +-#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** This function calls the following NEON kernels/functions: +- * +- * -# @ref NESpaceToDepthLayerKernelEx +- */ +-class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder +-{ +-public: +- /** Set the input and output tensors. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToDepthLayerEx (Static block shape and paddings) +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +index 408d150..24ff5da 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2019 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,16 +37,14 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+-
+ #ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+ #define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
++#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
+ #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+ #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
++#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+ #include "arm_compute/core/Types.h"
+ #include "arm_compute/runtime/IFunction.h"
+ #include "arm_compute/runtime/IMemoryManager.h"
+@@ -59,8 +57,8 @@ namespace arm_compute
+ {
+ /** Function to run the deconvolution layer.
+ *
+- * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+- * input depending on the stride and pad info and then perfrom a 1x1
++ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
++ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finaly a is a user
+ * specified value where a < stride - 1 that increases the padding top and right of the input image.
+@@ -81,21 +79,22 @@ namespace arm_compute
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y is the input stride of the first and second dimension.
+ *
+- * The weights used by Transpose convolution are supposed to be the same as the ones used for
+- * Convolution. Therefore, it will be necessary to use the weights in the
+- * reverse order to perform an actual convolution. This is achieved by using the @ref
+- * CPPFlipWeightsKernel.
++ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
++ * Therefore, it will be necessary to use the weights in the
++ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+- * -# @ref CPPUpsample
++ * -# @ref CPPUpsampleEx
+ * -# @ref NEConvolutionLayer
++ * -# @ref NEPermute
++ * -# @ref NEReverse
+ *
+ */
+ class NETransposeConvLayer : public IFunction
+ {
+ public:
+- /** Default constructor */
++ /** Constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+@@ -112,37 +111,38 @@ public:
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
++ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+- * supported: Same as @p input.
++ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
+- * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
++ * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
++ * for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+- * input.
++ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+- * decribed in @ref PadStrideInfo.
+- * @param[in] invalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+- * NETransposeConvLayer
++ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
++ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+- * supported: Same as @p input.
++ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
+- * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
++ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+- * input.
++ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+- * decribed in @ref PadStrideInfo.
+- * @param[in] innvalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
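(Reading aid for the NETransposeConvLayer hunk above: the transposed convolution is realised as zero-insertion upsampling of the input followed by an ordinary convolution whose weights have been reversed via NEReverse. A minimal 1-D sketch of the upsample step and the resulting size relation follows; plain C++, illustrative helper names, not the library implementation, and it ignores user padding and the invalid_right/invalid_bottom trimming handled by configure()/validate().)

#include <cstddef>
#include <vector>

// Zero-insertion upsample: place (stride - 1) zeros between neighbouring input elements.
// The convolution pass then runs over a suitably padded version of this tensor.
std::vector<float> zero_insert(const std::vector<float> &in, unsigned int stride)
{
  if (in.empty() || stride == 0)
    return {};
  std::vector<float> up((in.size() - 1) * stride + 1, 0.0f);
  for (std::size_t i = 0; i < in.size(); ++i)
    up[i * stride] = in[i];
  return up;
}

// Output width of the transposed convolution for kernel size k and stride s, with no padding:
// width_out = (width_in - 1) * s + k
unsigned int deconv_out_width(unsigned int width_in, unsigned int k, unsigned int s)
{
  return (width_in - 1) * s + k;
}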
+ * + * @return a status + */ +@@ -158,17 +158,11 @@ public: + private: + MemoryGroup _memory_group; + NEConvolutionLayer _conv_f; +- CPPUpsampleEx _upsample_f; +- CPPFlipWeightsKernel _flip_weights; +- NEPermute _permute_input; +- NEPermute _permute_weights; +- NEPermute _permute_output; ++ CPPUpsample _upsample_f; ++ NEReverse _flip_weights; + Tensor _scaled_output; + Tensor _weights_flipped; +- Tensor _permuted_input; +- Tensor _permuted_weights; +- Tensor _permuted_output; +- bool _is_nchw; ++ Tensor _flip_axis; + const ITensor *_original_weights; + ITensor *_input; + PadStrideInfo _info; +diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +index 7b6b974..ba42a24 100644 +--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp ++++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +@@ -55,16 +55,7 @@ using namespace arm_compute; + + const std::map CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels +- {"arg_op", "arg_operation.cl"}, +- {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, +- {"cast", "cast.cl"}, +- {"cast_qasymm_in", "cast.cl"}, +- {"cast_qasymm_out", "cast.cl"}, +- {"comparison_op", "comparison_op.cl"}, +- {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, +- {"depth_to_space_nchw", "depth_to_space.cl"}, +- {"depth_to_space_nhwc", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, +@@ -74,10 +65,6 @@ const std::map CLKernelLibraryEx::_kernel_program_map + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", "neg_tensor.cl"}, +- {"permute_generic", "permute_ex.cl"}, +- {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, +- {"prelu", "prelu.cl"}, +- {"prelu_qasymm8", "prelu_quantized.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, +@@ -91,29 +78,15 @@ const std::map CLKernelLibraryEx::_kernel_program_map + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, +- {"space_to_depth_nchw", "space_to_depth.cl"}, +- {"space_to_depth_nhwc", "space_to_depth.cl"}, + }; + + const std::map CLKernelLibraryEx::_program_source_map = { + #ifdef EMBEDDED_KERNELS + { +- "arg_operation.cl", +-#include "./cl_kernels/arg_operation.clembed" +- }, +- { +- "cast.cl", +-#include "./cl_kernels/cast.clembed" +- }, +- { + "embedding_lookup.cl", + #include "./cl_kernels/embedding_lookup.clembed" + }, + { +- "depth_to_space.cl", +-#include "./cl_kernels/depth_to_space.clembed" +- }, +- { + "gather_ex.cl", + #include "./cl_kernels/gather_ex.clembed" + }, +@@ -150,14 +123,6 @@ const std::map CLKernelLibraryEx::_program_source_map + #include "./cl_kernels/neg_tensor.clembed" + }, + { +- "prelu.cl", +-#include "./cl_kernels/prelu.clembed" +- }, +- { +- "prelu_quantized.cl", +-#include "./cl_kernels/prelu_quantized.clembed" +- }, +- { + "quantization_symm8.cl", + #include "./cl_kernels/quantization_symm8.clembed" + }, +@@ -170,10 +135,6 @@ const std::map CLKernelLibraryEx::_program_source_map + #include "./cl_kernels/scale_factor.clembed" + }, + { +- "space_to_depth.cl", +-#include "./cl_kernels/space_to_depth.clembed" +- }, +- { + "topkv2.cl", + #include "./cl_kernels/topkv2.clembed" + }, +diff 
--git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +deleted file mode 100644 +index 03717cf..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl ++++ /dev/null +@@ -1,137 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +-/** Perform arg_max/arg_min +- * +- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. +- * e.g. -DDATA_TYPE=short +- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. +- * e.g. -DDEPTH_OUT=16 +- * @attention Operation type(code) specifying which operation to perform should be passed as +- * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 +- * +- * @param[in] input_ptr Pointer to the source image. 
Supported data +- * types: +- * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension +- * (in bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension +- * (in bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element +- * in the source image +- * @param[in] input_stride_w Stride of the source tensor in W dimension +- * (in bytes) +- * @param[in] input_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[out] output_ptr Pointer to the destination image. +- * Supported data types: U32 +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension +- * (in bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- * @param[in] axis Axis through which reduction occurs +- * @param[in] dim Dimension across the axis to be reduced. 
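(The arg_op kernel documented above reduces one axis to the index of its maximum when OP_CODE=1 or its minimum when OP_CODE=2, writing the index as an unsigned integer. A scalar C++ reference for a single slice along the reduction axis, mirroring the update rule in the kernel body that follows; the function name is illustrative and not part of the library.)

#include <algorithm>
#include <cstdint>
#include <vector>

// Walk the slice, keep the running max (or min), and record the index at which that
// running value last changed; this matches the tval/idx bookkeeping in the OpenCL kernel.
uint32_t arg_op_slice(const std::vector<float> &slice, int op_code)
{
  if (slice.empty())
    return 0;
  float best = slice[0];
  uint32_t idx = 0;
  for (uint32_t i = 1; i < slice.size(); ++i)
  {
    const float v = (op_code == 1) ? std::max(best, slice[i]) : std::min(best, slice[i]);
    if (v != best)
    {
      idx = i;
      best = v;
    }
  }
  return idx;
}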
+- */ +- +-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, +- const int dim) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); +- +- int indices[4] = { +- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, +- get_global_id(2) / DEPTH_OUT, +- }; +- +- DATA_TYPE value = +- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); +- DATA_TYPE tval = value; +- int idx = 0; +- for (int i = 1; i < dim; ++i) +- { +- indices[axis] = i; +- +-#if OP_CODE == 1 // ArgMax +- value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], +- indices[2], indices[3]))); +-#elif OP_CODE == 2 // ArgMin +- value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], +- indices[2], indices[3]))); +-#else +- return; +- +-#endif +- +- if (tval != value) +- { +- idx = indices[axis]; +- tval = value; +- } +- } +- +- *((__global uint *)out.ptr) = idx; +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +deleted file mode 100644 +index f74c1c1..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl ++++ /dev/null +@@ -1,191 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016, 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
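For reference, the arg_op kernel deleted above scans dim entries along the reduction axis while the other three coordinates stay fixed, and keeps the index of the running maximum (OP_CODE == 1) or minimum (OP_CODE == 2). A minimal scalar C sketch of that per-work-item loop (the function name and the driver are illustrative, not part of the patch):

#include <stdio.h>

/* Return the index of the max (op == 1) or min (op == 2) element of a 1-D
 * slice, mirroring the per-work-item loop of the deleted arg_op kernel. */
static int arg_op_1d(const float *slice, int dim, int op)
{
  float best = slice[0];
  int idx = 0;
  for (int i = 1; i < dim; ++i)
  {
    float v = (op == 1) ? (slice[i] > best ? slice[i] : best)
                        : (slice[i] < best ? slice[i] : best);
    if (v != best) /* value changed, so position i is the new arg-max/min */
    {
      idx = i;
      best = v;
    }
  }
  return idx;
}

int main(void)
{
  const float x[5] = {1.f, 7.f, 3.f, 7.f, 2.f};
  printf("argmax=%d argmin=%d\n", arg_op_1d(x, 5, 1), arg_op_1d(x, 5, 2));
  return 0; /* prints argmax=1 argmin=0 */
}

As in the kernel, ties keep the earliest index, because the running value only changes when a strictly better element is found.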
+- */ +- +-#include "helpers_asymm.h" +- +-#ifdef SATURATE +-#define ADD(x, y) add_sat((x), (y)) +-#define SUB(x, y) sub_sat((x), (y)) +-#else /* SATURATE */ +-#define ADD(x, y) (x) + (y) +-#define SUB(x, y) (x) - (y) +-#endif /* SATURATE */ +- +-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to +- * QASYMM8 +- * +- * The following computations will be performed: +- * +- * -# Add offset terms to inputs +- -# Get scaled value of two inputs +- * -# Add inputs +- * -# Add offset terms to final result +- * -# Multiply each entry of result by result_mult_int +- * -# Shift the int32 accumulator by result_shift +- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. +- * +- * @attention The inputs and output data types need to be passed at compile time using +- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: +- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar +- * @attention The number of bits to shift left of input tensors must be passed at compile time using +- * -DLEFT_SHIFT +- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors +- * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, +- -DIN2_OFFSET, +- * -RIN2_MULT_INT and -DIN2_SHIFT +- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor +- * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and +- -DRESULT_SHIFT +- * +- * @attention The input and output data_types need to be passed at compile time using +- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: +- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar +- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time +- * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: +- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f +- * @attention The inputs and output scale offset need to be passed at compile time using +- * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: +- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise +- * wrapping policy will be used. +- * +- * @param[in] in1_ptr Pointer to the source tensor. +- * Supported data types: QASYMM8 +- * @param[in] in1_stride_x Stride of the source tensor in X dimension +- * (in bytes) +- * @param[in] in1_step_x in1_stride_x * number of elements along X processed +- * per workitem(in bytes) +- * @param[in] in1_stride_y Stride of the source tensor in Y dimension +- * (in bytes) +- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed +- * per workitem(in bytes) +- * @param[in] in1_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed +- * per workitem(in bytes) +- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source +- * tensor +- * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: +- * QASYMM8 +- * @param[in] in2_stride_x Stride of the source tensor in X dimension +- * (in bytes) +- * @param[in] in2_step_x in2_stride_x * number of elements along X processed +- * per workitem(in bytes) +- * @param[in] in2_stride_y Stride of the source tensor in Y dimension +- * (in bytes) +- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed +- * per workitem(in bytes) +- * @param[in] in2_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed +- * per workitem(in bytes) +- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source +- * tensor +- * @param[out] out_ptr Pointer to the destination tensor. +- * Supported data types: QASYMM8 +- * @param[in] out_stride_x Stride of the destination tensor in X dimension +- * (in bytes) +- * @param[in] out_step_x out_stride_x * number of elements along X processed +- * per workitem(in bytes) +- * @param[in] out_stride_y Stride of the destination tensor in Y dimension +- * (in bytes) +- * @param[in] out_step_y out_stride_y * number of elements along Y processed +- * per workitem(in bytes) +- * @param[in] out_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] out_step_z out_stride_z * number of elements along Z processed +- * per workitem(in bytes) +- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination +- * tensor +- */ +-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), +- TENSOR3D_DECLARATION(out)) +-{ +- // Get pixels pointer +- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); +- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); +- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); +- +- // Load data +- VEC_DATA_TYPE(int, 16) +- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); +- VEC_DATA_TYPE(int, 16) +- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); +- +- // Get scaled value of two inputs +- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); +- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); +- +- VEC_DATA_TYPE(int, 16) +- left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); +- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; +- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; +- +- VEC_DATA_TYPE(int, 16) +- scaled_in1_val = +- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); +- VEC_DATA_TYPE(int, 16) +- scaled_in2_val = +- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); +- +- // Add inputs and multiply with a multiplier smaller than 1 +- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; +- VEC_DATA_TYPE(int, 16) +- out_val = +- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); +- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); +- +- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); +- +- // TODO: Apply min-max BOUND to support fuse with relu. 
+- /* +- #if defined(MIN_BOUND) +- res = max(res, (uchar16)MIN_BOUND); +- #endif // defined(MIN_BOUND) +- #if defined(MAX_BOUND) +- res = min(res, (uchar16)MAX_BOUND); +- #endif // defined(MAX_BOUND) +- */ +- +- // Store result +- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +deleted file mode 100644 +index 4147a00..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl ++++ /dev/null +@@ -1,233 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#ifndef SCALE +-#define SCALE 1.0f +-#endif +-#ifndef OFFSET +-#define OFFSET 0 +-#endif +-#ifndef VEC_SIZE +-#define VEC_SIZE 1 +-#endif +- +-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) +-/** Perform a cast operation on an input tensor. +- * +- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and +- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @attention -DBOOL_INPUT : Whether type of input is bool. +- * +- * @param[in] input_ptr Pointer to the source image. 
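The arithmetic_add_qasymm8 kernel removed above performs the documented steps (offset, scale, add, rescale, clamp) entirely in integer arithmetic with fixed-point multipliers and shifts. A float reference of the same QASYMM8 addition is easier to follow and should agree with the integer pipeline up to rounding; the kernel folds the sign of the zero points into its OFFSET constants, so the parameter names below are illustrative rather than a transcription of the kernel:

#include <math.h>
#include <stdio.h>

/* Float reference for QASYMM8 addition: dequantize both inputs with their own
 * (scale, zero_point), add in real space, then requantize with the output
 * parameters and clamp to the QASYMM8 range [0, 255]. */
static unsigned char qadd_ref(unsigned char a, float sa, int za,
                              unsigned char b, float sb, int zb,
                              float so, int zo)
{
  float real = sa * (float)(a - za) + sb * (float)(b - zb);
  float q = rintf(real / so) + (float)zo;
  if (q < 0.f)   q = 0.f;
  if (q > 255.f) q = 255.f;
  return (unsigned char)q;
}

int main(void)
{
  /* a = 0.5 and b = 0.3 with scale 0.01, zero point 10; output scale 0.02 */
  unsigned char c = qadd_ref(60, 0.01f, 10, 40, 0.01f, 10, 0.02f, 0);
  printf("%u\n", c); /* (0.5 + 0.3) / 0.02 = 40 */
  return 0;
}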
Supported data +- * types: F16/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VSTORE(VEC_SIZE) +- (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), +- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), +- 0, (__global DATA_TYPE_OUT *)output.ptr); +- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) +- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), +- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +-#if defined(BOOL_INPUT) +- VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); +- VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); +- res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +-#endif // defined(BOOL_INPUT) +- +- VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); +-} +- +-/** Perform a cast operation on an QASYMM8 input tensor. +- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and +- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int +- * @attention Offset and Scale of input should be given as a preprocessor argument using +- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * +- * @param[in] input_ptr Pointer to the source image. 
Supported data +- * types: F16/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) +- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); +- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); +- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); +- +- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; +- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; +- +- VSTORE(VEC_SIZE) +- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, +- (__global DATA_TYPE_OUT *)output.ptr); +-} +- +-/** Perform a cast operation on an QASYMM8 output tensor. +- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and +- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int +- * @attention Offset and Scale of output should be given as a preprocessor argument using +- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * +- * @param[in] input_ptr Pointer to the source image. 
Supported data +- * types: F16/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported data +- * types: U8 +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) +- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); +- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); +- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); +- +- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; +- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); +- +- VSTORE(VEC_SIZE) +- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, +- (__global DATA_TYPE_OUT *)output.ptr); +-} +-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +deleted file mode 100644 +index 0285c95..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl ++++ /dev/null +@@ -1,185 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016, 2017 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +-/** Perform space to depth rearrangement of tensor +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. +- * e.g. -DDEPTH_OUT=16 +- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument +- * using -DZ_OUT=size. e.g. -DZ_OUT=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- out_index[0] = get_global_id(0); // W +- out_index[1] = get_global_id(1); // H +- out_index[2] = get_global_id(2) % Z_OUT; // C +- out_index[3] = get_global_id(2) / Z_OUT; // B +- +- in_index[0] = out_index[0] / BLOCK_SIZE; +- in_index[1] = out_index[1] / BLOCK_SIZE; +- in_index[2] = out_index[2] + +- ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; +- in_index[3] = out_index[3]; +- +- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( +- &in, in_index[0], in_index[1], in_index[2], in_index[3])); +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +- +-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +-/** Perform space to depth rearrangement of tensor (NHWC) +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. +- * e.g. -DDEPTH_OUT=16 +- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument +- * using -DZ_OUT=size. e.g. -DZ_OUT=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- out_index[0] = get_global_id(0); // C +- out_index[1] = get_global_id(1); // W +- out_index[2] = get_global_id(2) % Z_OUT; // H +- out_index[3] = get_global_id(2) / Z_OUT; // B +- +- in_index[0] = out_index[0] + +- ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; +- in_index[1] = out_index[1] / BLOCK_SIZE; +- in_index[2] = out_index[2] / BLOCK_SIZE; +- in_index[3] = out_index[3]; +- +- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( +- &in, in_index[0], in_index[1], in_index[2], in_index[3])); +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +index 2d0b6a2..e07a25e 100644 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h ++++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2016-2018 ARM Limited. ++ * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,7 +37,6 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #ifndef ARM_COMPUTE_HELPER_H + #define ARM_COMPUTE_HELPER_H + +@@ -59,16 +58,219 @@ + #pragma OPENCL EXTENSION cl_arm_printf : enable + #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) + ++#define GPU_ARCH_MIDGARD 0x100 ++#define GPU_ARCH_BIFROST 0x200 ++ ++/** Concatenate two inputs. ++ * ++ * @param[in] a The first input to be concatenated ++ * @param[in] b The second input to be concatenated ++ * ++ * @return The concatenated output ++ */ ++#define CONCAT(a, b) a##b ++ ++/** Expand the given vector ++ * ++ * @param[in] x The vector to be expanded ++ * ++ * @return The expanded output ++ */ + #define EXPAND(x) x + ++/** Clamp the given value between an upper and lower bound. ++ * ++ * @param[in] x The value to be clamped ++ * @param[in] min_val The lower bound ++ * @param[in] max_val The upper bound ++ * ++ * @return The clamped value. ++ */ + #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + ++/** REVn reverses the given vector whose size is n. 
++ * @name REVn ++ * ++ * @param[in] x The vector to be reversed ++ * ++ * @return The reversed vector ++ * @{ ++ */ ++#define REV1(x) ((x)) ++#define REV2(x) ((x).s10) ++#define REV3(x) ((x).s210) ++#define REV4(x) ((x).s3210) ++#define REV8(x) ((x).s76543210) ++#define REV16(x) ((x).sFEDCBA9876543210) ++/** @} */ // end of group REVn ++ ++/** Reverse the given vector. ++ * @name REVERSE ++ * ++ * @param[in] x The vector to be reversed ++ * @param[in] s The size of the vector ++ * ++ * @return The reversed vector ++ * @{ ++ */ ++#define REVERSE_STR(x, s) REV##s((x)) ++#define REVERSE(x, s) REVERSE_STR(x, s) ++/** @} */ // end of group REVERSE ++ ++/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. ++ * @name ROTs_n ++ * ++ * @param[in] x The vector to be shifted ++ * ++ * @return The shifted vector ++ * @{ ++ */ ++#define ROT1_0(x) ((x)) ++ ++#define ROT2_0(x) ((x)) ++#define ROT2_1(x) ((x).s10) ++ ++#define ROT3_0(x) ((x)) ++#define ROT3_1(x) ((x).s201) ++#define ROT3_2(x) ((x).s120) ++ ++#define ROT4_0(x) ((x)) ++#define ROT4_1(x) ((x).s3012) ++#define ROT4_2(x) ((x).s2301) ++#define ROT4_3(x) ((x).s1230) ++ ++#define ROT8_0(x) ((x)) ++#define ROT8_1(x) ((x).s70123456) ++#define ROT8_2(x) ((x).s67012345) ++#define ROT8_3(x) ((x).s56701234) ++#define ROT8_4(x) ((x).s45670123) ++#define ROT8_5(x) ((x).s34567012) ++#define ROT8_6(x) ((x).s23456701) ++#define ROT8_7(x) ((x).s12345670) ++ ++#define ROT16_0(x) ((x)) ++#define ROT16_1(x) ((x).sF0123456789ABCDE) ++#define ROT16_2(x) ((x).sEF0123456789ABCD) ++#define ROT16_3(x) ((x).sDEF0123456789ABC) ++#define ROT16_4(x) ((x).sCDEF0123456789AB) ++#define ROT16_5(x) ((x).sBCDEF0123456789A) ++#define ROT16_6(x) ((x).sABCDEF0123456789) ++#define ROT16_7(x) ((x).s9ABCDEF012345678) ++#define ROT16_8(x) ((x).s89ABCDEF01234567) ++#define ROT16_9(x) ((x).s789ABCDEF0123456) ++#define ROT16_10(x) ((x).s6789ABCDEF012345) ++#define ROT16_11(x) ((x).s56789ABCDEF01234) ++#define ROT16_12(x) ((x).s456789ABCDEF0123) ++#define ROT16_13(x) ((x).s3456789ABCDEF012) ++#define ROT16_14(x) ((x).s23456789ABCDEF01) ++#define ROT16_15(x) ((x).s123456789ABCDEF0) ++/** @} */ // end of group ROTs_n ++ ++/** Circular-right-shift (rotate-right) the given vector by the given amount. ++ * @name ROTATE ++ * ++ * @param[in] x The vector to be shifted ++ * @param[in] s The size of the vector ++ * @param[in] n The amount to be shifted ++ * ++ * @return The shifted vector ++ * @{ ++ */ ++#define ROTATE_STR(x, s, n) ROT##s##_##n(x) ++#define ROTATE(x, s, n) ROTATE_STR(x, s, n) ++/** @} */ // end of group ROTATE ++ ++/** Creates a vector of size n filled with offset values corresponding to the location of each ++ * element. ++ * @name V_OFFSn ++ * ++ * @param[in] dt The data type of the output vector ++ * ++ * @return The vector filled with offset values ++ * @{ ++ */ ++#define V_OFFS1(dt) (dt)(0) ++#define V_OFFS2(dt) (dt)(0, 1) ++#define V_OFFS3(dt) (dt)(0, 1, 3) ++#define V_OFFS4(dt) (dt)(0, 1, 2, 3) ++#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) ++#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) ++/** @} */ // end of group V_OFFSn ++ ++/** Create a vector filled with offset values corresponding to the location of each element. 
++ * @name VEC_OFFS ++ * ++ * @param[in] dt The data type of the output vector ++ * @param[in] s The size of the output vector ++ * ++ * @return The vector filled with offset values ++ * @{ ++ */ ++#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) ++#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) ++/** @} */ // end of group VEC_OFFS ++ + #define VLOAD_STR(size) vload##size + #define VLOAD(size) VLOAD_STR(size) + + #define VSTORE_STR(size) vstore##size + #define VSTORE(size) VSTORE_STR(size) + ++#define float1 float ++#define half1 half ++#define char1 char ++#define uchar1 uchar ++#define short1 short ++#define ushort1 ushort ++#define int1 int ++#define uint1 uint ++#define long1 long ++#define ulong1 ulong ++#define double1 double ++ ++#define vload1(OFFSET, PTR) *(OFFSET + PTR) ++#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA ++ ++// Convert built-in functions with _sat modifier are not supported in floating point so we create ++// defines ++// without _sat to overcome this issue ++#define convert_float_sat convert_float ++#define convert_float1_sat convert_float ++#define convert_float2_sat convert_float2 ++#define convert_float3_sat convert_float3 ++#define convert_float4_sat convert_float4 ++#define convert_float8_sat convert_float8 ++#define convert_float16_sat convert_float16 ++#define convert_half_sat convert_float ++#define convert_half1_sat convert_half ++#define convert_half2_sat convert_half2 ++#define convert_half3_sat convert_half3 ++#define convert_half4_sat convert_half4 ++#define convert_half8_sat convert_half8 ++#define convert_half16_sat convert_half16 ++ ++#define convert_float1 convert_float ++#define convert_half1 convert_half ++#define convert_char1 convert_char ++#define convert_uchar1 convert_uchar ++#define convert_short1 convert_short ++#define convert_ushort1 convert_ushort ++#define convert_int1 convert_int ++#define convert_uint1 convert_uint ++#define convert_long1 convert_long ++#define convert_ulong1 convert_ulong ++#define convert_double1 convert_double ++ ++#define convert_char1_sat convert_char_sat ++#define convert_uchar1_sat convert_uchar_sat ++#define convert_short1_sat convert_short_sat ++#define convert_ushort1_sat convert_ushort_sat ++#define convert_int1_sat convert_int_sat ++#define convert_uint1_sat convert_uint_sat ++#define convert_long1_sat convert_long_sat ++#define convert_ulong1_sat convert_ulong_sat ++#define convert_double1_sat convert_double_sat ++ + #define VEC_DATA_TYPE_STR(type, size) type##size + #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +index a83b1a8..5f1b3f9 100644 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h ++++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2018 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,29 +37,112 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
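The vector utilities added to helpers.h above (REVERSE, ROTATE, VEC_OFFS, VLOAD, VSTORE, VEC_DATA_TYPE) all follow the same pattern: an inner ..._STR macro that does the token pasting and an outer wrapper that forces its arguments to be macro-expanded first, so a size passed as another macro still resolves to a number before pasting. A minimal C illustration of why the second level is needed (macro names here are illustrative, not the ones from the header):

#include <stdio.h>

#define SIZE 4

/* Single-level pasting: SIZE is glued literally, producing the token rev_SIZE. */
#define PASTE_BAD(name, s) name##s

/* Two-level pasting, as in helpers.h: the inner _STR macro sees the already
 * expanded argument, so PASTE_OK(rev_, SIZE) becomes rev_4. */
#define PASTE_STR(name, s) name##s
#define PASTE_OK(name, s) PASTE_STR(name, s)

static int rev_4 = 42;

int main(void)
{
  /* int x = PASTE_BAD(rev_, SIZE);  would not compile: 'rev_SIZE' undeclared */
  int x = PASTE_OK(rev_, SIZE);
  printf("%d\n", x); /* 42 */
  return 0;
}

The size-1 aliases (float1, vload1, convert_float1_sat and friends) added above exist for the same reason: they let the generated names remain valid when VEC_SIZE expands to 1, where OpenCL has no built-in vector type or vload/vstore variant.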
+ */ +- + #ifndef ARM_COMPUTE_HELPERS_ASYMM_H + #define ARM_COMPUTE_HELPERS_ASYMM_H + + #include "helpers.h" + ++/** Convert the given vector with round to nearest even rounding mode ++ * ++ * @param[in] x The target to be converted ++ * @param[in] type The target type ++ * ++ * @return The converted vector ++ */ ++#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) ++#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) ++ ++/** Quantize a floating-point scalar value to 8-bit asymmetric ++ * ++ * @param[in] input Input value to quantize ++ * @param[in] offset Quantization offset ++ * @param[in] scale Quantization scale ++ * ++ * @return quantized value ++ */ ++inline uchar quantize_qasymm8(float input, float offset, float scale) ++{ ++ float out_f32 = input / scale + offset; ++ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); ++ return res_u8; ++} ++ ++/** Dequantize a scalar value from 8-bit asymmetric to floating-point ++ * ++ * @param[in] input Input value to quantize ++ * @param[in] offset Quantization offset ++ * @param[in] scale Quantization scale ++ * ++ * @return quantized value ++ */ ++inline float dequantize_qasymm8(uchar input, float offset, float scale) ++{ ++ return ((float)input - offset) * scale; ++} ++ ++/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point ++ * ++ * @param[in] input Input value to quantize ++ * @param[in] offset Quantization offset ++ * @param[in] scale Quantization scale ++ * ++ * @return quantized value ++ */ ++inline float dequantize_qasymm8_signed(char input, float offset, float scale) ++{ ++ return ((float)input - offset) * scale; ++} ++ ++/** Quantize a vector of values from floating-point ++ * ++ * @param[in] type Output data type. ++ * @param[in] size Size of vector. ++ * ++ * @return quantized values ++ */ ++#define QUANTIZE_IMPL(type, size) \ ++ inline VEC_DATA_TYPE(type, size) \ ++ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ ++ { \ ++ VEC_DATA_TYPE(float, size) \ ++ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ ++ VEC_DATA_TYPE(type, size) \ ++ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ ++ VEC_DATA_TYPE(type, size)); \ ++ return res; \ ++ } ++ ++/** Dequantize a vector of values to floating-point ++ * ++ * @param[in] type Input data type. ++ * @param[in] size Size of vector. ++ * ++ * @return dequantized values in floating point ++ */ ++#define DEQUANTIZE_IMPL(type, size) \ ++ inline VEC_DATA_TYPE(float, size) \ ++ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ ++ { \ ++ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ ++ } ++ + /** Correctly-rounded-to-nearest division by a power-of-two. + * + * @param[in] size Size of vector. + * + * @return Correctly-rounded-to-nearest division by a power-of-two. 
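The quantize/dequantize helpers added to helpers_asymm.h above pair an affine mapping with a round-to-nearest-even convert (CONVERT_DOWN_RTE) and a saturating cast (CONVERT_SAT). A scalar C sketch of that behaviour, assuming the default floating-point rounding mode so that rintf matches the _rte convert:

#include <math.h>
#include <stdio.h>

/* Quantize: scale into the quantized domain, round to nearest even (like
 * CONVERT_DOWN_RTE), then saturate to [0, 255] (like CONVERT_SAT to uchar). */
static unsigned char quantize_qasymm8_ref(float x, float offset, float scale)
{
  float q = rintf(x / scale + offset);
  if (q < 0.f)   q = 0.f;
  if (q > 255.f) q = 255.f;
  return (unsigned char)q;
}

static float dequantize_qasymm8_ref(unsigned char q, float offset, float scale)
{
  return ((float)q - offset) * scale;
}

int main(void)
{
  /* 0.625 with scale 0.25 and offset 128: 0.625/0.25 + 128 = 130.5, which
   * rounds to the even neighbour 130 (round-half-away-from-zero would give 131). */
  unsigned char q = quantize_qasymm8_ref(0.625f, 128.f, 0.25f);
  printf("q=%u back=%f\n", q, dequantize_qasymm8_ref(q, 128.f, 0.25f)); /* q=130 back=0.500000 */
  return 0;
}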
+ */ +-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ +- inline VEC_DATA_TYPE(int, size) \ +- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ +- { \ +- VEC_DATA_TYPE(int, size) \ +- mask = (1 << exponent) - 1; \ +- const VEC_DATA_TYPE(int, size) zero = 0; \ +- const VEC_DATA_TYPE(int, size) one = 1; \ +- VEC_DATA_TYPE(int, size) \ +- threshold = (mask >> 1) + select(zero, one, x < 0); \ +- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ ++#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ ++ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ ++ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ ++ { \ ++ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ ++ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ ++ VEC_DATA_TYPE(int, size) \ ++ mask = (one << exponent) - one; \ ++ VEC_DATA_TYPE(int, size) \ ++ threshold = (mask >> 1) + select(zero, one, x < 0); \ ++ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ + } + + /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), +@@ -81,9 +164,19 @@ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ +- /* COMPMID-907 */ \ ++ /* Revert COMPMID-907 */ \ ++ VEC_DATA_TYPE(long, size) \ ++ mask1 = 1 << 30; \ ++ VEC_DATA_TYPE(long, size) \ ++ mask2 = 1 - (1 << 30); \ ++ VEC_DATA_TYPE(long, size) \ ++ is_positive_or_zero = ab_64 >= 0; \ ++ VEC_DATA_TYPE(long, size) \ ++ nudge = select(mask2, mask1, is_positive_or_zero); \ ++ VEC_DATA_TYPE(long, size) \ ++ mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ +- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ ++ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ + } + +@@ -335,9 +428,18 @@ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ + } + ++#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) ++#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) ++#define DEQUANTIZE_STR(input, offset, scale, type, size) \ ++ dequantize_##type##size(input, offset, scale) ++#define DEQUANTIZE(input, offset, scale, type, size) \ ++ DEQUANTIZE_STR(input, offset, scale, type, size) ++ + #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ + asymm_rounding_divide_by_POW2_##size(x, exponent) + #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) ++#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ++ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) + #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) + #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ +@@ -360,11 +462,53 @@ + #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) + ++#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ ++ inline VEC_DATA_TYPE(int, size) \ ++ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ ++ { \ ++ const int left_shift = shift > 0 ? shift : 0; \ ++ const int right_shift = shift > 0 ? 
0 : -shift; \ ++ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ ++ right_shift, size); \ ++ } ++#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ ++ multiply_by_quantized_multiplier##size(input, qmul, shift) ++ ++QUANTIZE_IMPL(uchar, 1) ++QUANTIZE_IMPL(char, 1) ++QUANTIZE_IMPL(uint, 1) ++QUANTIZE_IMPL(int, 1) ++QUANTIZE_IMPL(uchar, 4) ++QUANTIZE_IMPL(ushort, 4) ++QUANTIZE_IMPL(short, 4) ++QUANTIZE_IMPL(uchar, 16) ++QUANTIZE_IMPL(char, 16) ++QUANTIZE_IMPL(ushort, 16) ++QUANTIZE_IMPL(short, 16) ++QUANTIZE_IMPL(uint, 16) ++QUANTIZE_IMPL(int, 16) ++ ++DEQUANTIZE_IMPL(uchar, 1) ++DEQUANTIZE_IMPL(char, 1) ++DEQUANTIZE_IMPL(uint, 1) ++DEQUANTIZE_IMPL(int, 1) ++DEQUANTIZE_IMPL(uchar, 4) ++DEQUANTIZE_IMPL(ushort, 4) ++DEQUANTIZE_IMPL(short, 4) ++DEQUANTIZE_IMPL(uchar, 16) ++DEQUANTIZE_IMPL(char, 16) ++DEQUANTIZE_IMPL(ushort, 16) ++DEQUANTIZE_IMPL(short, 16) ++DEQUANTIZE_IMPL(uint, 16) ++DEQUANTIZE_IMPL(int, 16) ++ ++ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + ++ASYMM_MULT_IMPL(1) + ASYMM_MULT_IMPL(2) + ASYMM_MULT_IMPL(4) + ASYMM_MULT_IMPL(8) +@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) + ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) + ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + ++ASYMM_SELECT_USING_MASK_IMPL(1) + ASYMM_SELECT_USING_MASK_IMPL(2) + ASYMM_SELECT_USING_MASK_IMPL(4) + ASYMM_SELECT_USING_MASK_IMPL(8) + ASYMM_SELECT_USING_MASK_IMPL(16) + ++ASYMM_MASK_IF_ZERO_IMPL(1) + ASYMM_MASK_IF_ZERO_IMPL(2) + ASYMM_MASK_IF_ZERO_IMPL(4) + ASYMM_MASK_IF_ZERO_IMPL(8) + ASYMM_MASK_IF_ZERO_IMPL(16) + ++ASYMM_MASK_IF_NON_ZERO_IMPL(1) + ASYMM_MASK_IF_NON_ZERO_IMPL(2) + ASYMM_MASK_IF_NON_ZERO_IMPL(4) + ASYMM_MASK_IF_NON_ZERO_IMPL(8) +@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) + ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) + ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + ++ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) + ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) + ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + ++ASYMM_RESCALE_IMPL(1) + ASYMM_RESCALE_IMPL(2) + ASYMM_RESCALE_IMPL(4) + ASYMM_RESCALE_IMPL(8) + ASYMM_RESCALE_IMPL(16) + ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) ++ + #endif // ARM_COMPUTE_HELPERS_ASYMM_H +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +deleted file mode 100644 +index 12c8eeb..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl ++++ /dev/null +@@ -1,120 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
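Taken together, the reworked ASYMM_MULT (with the reverted COMPMID-907 nudge), the generalised asymm_rounding_divide_by_POW2 and the new multiply_by_quantized_multiplier implement the usual fixed-point requantization: a saturating doubling high multiply by a Q31 multiplier followed by a rounding right shift. A scalar int32/int64 sketch of that arithmetic (not the vectorised OpenCL code):

#include <stdint.h>
#include <stdio.h>

/* Saturating doubling high multiply: (a * b * 2) >> 32 with a rounding nudge,
 * the scalar counterpart of the rewritten ASYMM_MULT macro. */
static int32_t sat_doubling_high_mul(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;                       /* the only overflowing case */
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Rounding (to nearest, ties away from zero) arithmetic right shift, the
 * scalar counterpart of asymm_rounding_divide_by_POW2. */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
  int32_t mask = (1 << exponent) - 1;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
}

/* multiply_by_quantized_multiplier: rescale x by qmul * 2^shift. */
static int32_t mul_by_qmult(int32_t x, int32_t qmul, int shift)
{
  int left = shift > 0 ? shift : 0, right = shift > 0 ? 0 : -shift;
  return rounding_divide_by_pow2(sat_doubling_high_mul(x << left, qmul), right);
}

int main(void)
{
  /* qmul = 0.5 in Q31 (1 << 30), shift = 0: result is x / 2, rounded. */
  printf("%d %d\n", mul_by_qmult(10, 1 << 30, 0), mul_by_qmult(7, 1 << 30, 0));
  return 0; /* prints 5 4 */
}

The new size-1 instantiations (ASYMM_*_IMPL(1), QUANTIZE_IMPL(type, 1), and so on) simply make the same machinery available to scalar code paths.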
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#ifndef VEC_SIZE +-#define VEC_SIZE 1 +-#endif +- +-#if defined(DATA_TYPE) +-/** Returns result of prelu function implemented as below: +- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @note Can only take floating point data types. +- * +- * @param[in] input1_ptr Pointer to the source image. Supported Data +- * types : F16/F32 +- * @param[in] input1_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input1_step_x input1_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input1_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input1_step_y input1_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input1_step_z input1_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[in] alpha_ptr Pointer to the source image. 
Supported Data +- * types : F16/F32 +- * @param[in] alpha_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] alpha_step_x input2_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] alpha_step_y input2_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] alpha_step_z input2_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * +- * @param[out] output_ptr Pointer to the destination image. Supported +- * data types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), +- TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VSTORE(VEC_SIZE) +- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 +- ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * +- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) +- : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), +- 0, (__global DATA_TYPE *)output.ptr); +-} +-#endif // defined(DATA_TYPE) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +deleted file mode 100644 +index a66e107..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl ++++ /dev/null +@@ -1,138 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +-#define SUB(x, y) (x) - (y) +- +-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ +- defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) +- +-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +-#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +-#define SELECT_TYPE VEC_INT +- +-/** Returns result of prelu function implemented as below: +- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. +- * +- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. +- * -DDATA_TYPE_IN=uchar +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @note Can only take uchar data types. +- * +- * @param[in] input1_ptr Pointer to the source image. Supported Data +- * types : QASYMM8 +- * @param[in] input1_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input1_step_x input1_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input1_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input1_step_y input1_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input1_step_z input1_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[in] alpha_ptr Pointer to the source image. 
Supported Data +- * types : QASYMM8 +- * @param[in] alpha_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] alpha_step_x input2_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] alpha_step_y input2_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] alpha_step_z input2_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported +- * data types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), +- TENSOR3D_DECLARATION(output)) +-{ +- // Get pixels pointer +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); +- VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); +- +- in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); +- alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); +- +- const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); +- const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); +- const VEC_FLOAT outf32 = +- select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); +- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); +- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); +- +- VSTORE(VEC_SIZE) +- (res, 0, (__global uchar *)output.ptr); +-} +- +-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && +- // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +deleted file mode 100644 +index eb612f8..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl ++++ /dev/null +@@ -1,185 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
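The prelu_qasymm8 kernel removed above dequantizes both operands with their offset/scale, applies the same PReLU rule in float, then requantizes with the output scale and offset. A scalar sketch of that arithmetic, assuming QASYMM8 (uint8, asymmetric) parameters; the parameter names are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize -> PReLU -> requantize, as performed per element by the
    // deleted prelu_qasymm8 kernel.
    uint8_t prelu_qasymm8_reference(uint8_t in, uint8_t alpha,
                                    int off_in, float scale_in,
                                    int off_alpha, float scale_alpha,
                                    int off_out, float scale_out)
    {
      const float in_f32    = (static_cast<int>(in) - off_in) * scale_in;
      const float alpha_f32 = (static_cast<int>(alpha) - off_alpha) * scale_alpha;
      const float out_f32   = (in_f32 < 0.0f) ? in_f32 * alpha_f32 : in_f32;

      // Round to nearest and saturate to the uint8 range, mirroring
      // CONVERT_RTE / CONVERT_SAT in the kernel.
      const int q = static_cast<int>(std::nearbyint(out_f32 / scale_out)) + off_out;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }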
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016, 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +-/** Perform space to depth rearrangement of tensor +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. +- * e.g. -DDEPTH_IN=16 +- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor +- * argument using -DZ_IN=size. e.g. -DZ_IN=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- in_index[0] = get_global_id(0); // W +- in_index[1] = get_global_id(1); // H +- in_index[2] = get_global_id(2) % Z_IN; // C +- in_index[3] = get_global_id(2) / Z_IN; // B +- +- out_index[0] = in_index[0] / BLOCK_SIZE; +- out_index[1] = in_index[1] / BLOCK_SIZE; +- out_index[2] = +- in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; +- out_index[3] = in_index[3]; +- +- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], +- out_index[3])) = *((__global DATA_TYPE *)in.ptr); +-} +-#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +- +-#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +-/** Perform space to depth rearrangement of tensor +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. +- * e.g. -DDEPTH_IN=16 +- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor +- * argument using -DZ_IN=size. e.g. -DZ_IN=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- in_index[0] = get_global_id(0); // C +- in_index[1] = get_global_id(1); // W +- in_index[2] = get_global_id(2) % Z_IN; // H +- in_index[3] = get_global_id(2) / Z_IN; // B +- +- out_index[0] = +- in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; +- out_index[1] = in_index[1] / BLOCK_SIZE; +- out_index[2] = in_index[2] / BLOCK_SIZE; +- out_index[3] = in_index[3]; +- +- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], +- out_index[3])) = *((__global DATA_TYPE *)in.ptr); +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp +deleted file mode 100644 +index 06eeb5b..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
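The space_to_depth_nhwc kernel removed above only remaps coordinates: the block offset of each spatial position is folded into the channel index. A host-side sketch of that index mapping, with block_size and depth_in standing in for the -DBLOCK_SIZE and -DDEPTH_IN build options:

    #include <array>

    // Where an NHWC input element at (c, w, h, b) lands in the output of the
    // deleted space_to_depth_nhwc kernel.
    std::array<int, 4> space_to_depth_nhwc_index(int c, int w, int h, int b,
                                                 int block_size, int depth_in)
    {
      std::array<int, 4> out{};
      out[0] = c + ((h % block_size) * block_size + w % block_size) * depth_in; // C
      out[1] = w / block_size;                                                  // W
      out[2] = h / block_size;                                                  // H
      out[3] = b;                                                               // B
      return out;
    }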
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +-{ +- TensorShape out_shape{input_shape}; +- +- out_shape.set(axis, 1); +- +- return out_shape; +-} +-} // namespace +- +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, +- ArgOperation /*op*/) +-{ +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8, +- DataType::QASYMM8); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) != +- output->tensor_shape().num_dimensions(), +- "Input's rank is not same with output"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, +- "Inputs are not broadcast compatible"); +- +- const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), +- "output shape's size does not match axis"); +- +- const auto num_dimensions = input->tensor_shape().num_dimensions(); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); +- return Status{}; +-} +- +-} // namespace +- +-CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} +- +-void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, +- ArgOperation op) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); +- +- _input = input; +- _output = output; +- _axis = axis; +- +- std::unique_ptr output_info = output->info()->clone(); +- output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); +- +- // Construct kernel and set op_code based on type of ArgOperation as specified by object op +- std::string kernel_name = "arg_op"; +- int op_code = 0; +- if (op == ArgOperation::MAX) +- { +- op_code = 1; +- } +- else if (op == ArgOperation::MIN) +- { +- 
op_code = 2; +- } +- else +- throw std::runtime_error("Operation not supported, yet"); +- +- // Set kernel build options +- std::set build_opts; +- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); +- build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); +- +- // Create kernel +- _kernel = +- static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); +- +- // Configure kernel window +- Window win = calculate_max_window(*output_info, Steps()); +- +- Coordinates coord; +- coord.set_num_dimensions(output_info->num_dimensions()); +- output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +- const uint32_t axis, ArgOperation op) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); +- +- return Status{}; +-} +- +-void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- const TensorShape &shape_in = _input->info()->tensor_shape(); +- +- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters +- +- _kernel.setArg(idx++, _axis); +- _kernel.setArg(idx++, shape_in[_axis]); +- +- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); +- +- // Setup input slice +- Window slice_in(slice_out); +- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- slice_in.set(3, Window::Dimension(0, 0, 0)); +- +- // Copy output's shape in order to use for recovering at end of this method +- const TensorShape shape_out = _output->info()->tensor_shape(); +- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); +- +- do +- { +- unsigned int idx = 0; +- add_4D_tensor_argument(idx, _input, slice_in); +- add_4D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +- +- // Recover output's shape of output tensor +- _output->info()->set_tensor_shape(shape_out); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +index bb55568..fbc76f5 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +deleted file mode 100644 +index 01ea655..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp ++++ /dev/null +@@ -1,132 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
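The CLArgOperationKernel removed above infers the output shape by collapsing the reduced axis to 1 and encodes the operation as the -DOP_CODE build option (MAX -> 1, MIN -> 2, anything else rejected). A sketch of both rules; the ArgOperation enum and the std::vector shape are stand-ins for the arm_compute types:

    #include <stdexcept>
    #include <vector>

    enum class ArgOperation { MAX, MIN };

    // The reduced axis collapses to a single element in the inferred output shape.
    std::vector<int> infer_arg_output_shape(std::vector<int> shape, unsigned int axis)
    {
      shape[axis] = 1;
      return shape;
    }

    // ArgOperation -> -DOP_CODE value used when building the "arg_op" kernel.
    int arg_operation_op_code(ArgOperation op)
    {
      switch (op)
      {
        case ArgOperation::MAX: return 1;
        case ArgOperation::MIN: return 2;
        default: throw std::runtime_error("Operation not supported, yet");
      }
    }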
+- */ +- +-#include "arm_compute/core/CL/kernels/CLCastKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} +- +-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); +- +- _input = input; +- _output = output; +- +- constexpr unsigned int num_elems_processed_per_iteration = 16; +- +- // Set kernel build options +- CLBuildOptions build_opts; +- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.add_option("-DDATA_TYPE_OUT=" + +- get_cl_type_from_data_type(output->info()->data_type())); +- build_opts.add_option( +- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); +- +- // Create kernel +- if (is_data_type_quantized_asymmetric(input->info()->data_type())) +- { +- UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); +- const float scale_in = qinfo.scale; +- const int offset_in = qinfo.offset; +- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); +- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); +- +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options())); +- } +- else if (is_data_type_quantized_asymmetric(output->info()->data_type())) +- { +- UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); +- const float scale_in = qinfo.scale; +- const float offset_in = qinfo.offset; +- +- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); +- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); +- +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options())); +- } +- else +- { +- build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT"); +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel("cast", build_opts.options())); +- } +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); +- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); +- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); +- update_window_and_padding(win, input_access, output_access); +- output_access.set_valid_region(win, input->info()->valid_region()); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); +- Window slice = collapsed.first_slice_window_3D(); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice); +- 
add_3D_tensor_argument(idx, _output, slice); +- enqueue(queue, *this, slice, lws_hint()); +- } while (collapsed.slide_window_slice_3D(slice)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp +deleted file mode 100644 +index 3891368..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp ++++ /dev/null +@@ -1,140 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
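The CLCastKernel removed above picks one of three OpenCL kernels depending on which side is quantized. A sketch of that selection; the booleans stand in for the is_data_type_quantized_asymmetric() checks on the input and output infos:

    #include <string>

    // Kernel variant chosen by the deleted CLCastKernel::configure():
    // quantized input is dequantized, quantized output is quantized,
    // everything else uses the plain cast kernel.
    std::string select_cast_kernel(bool input_is_qasymm, bool output_is_qasymm)
    {
      if (input_is_qasymm)
        return "cast_qasymm_in";
      if (output_is_qasymm)
        return "cast_qasymm_out";
      return "cast";
    }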
+- */ +- +-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-// TODO Use this validation function +-#if 0 +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, +- const int32_t block_size) +-{ +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, +- "Block size should be greater than or equal to 1."); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, +- "Output width should be equal to (Input width * block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, +- "Output height should be equal to (Input height * block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, +- "Input depth should be divisible by (block size * block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- output->dimension(2) != input->dimension(2) / (block_size * block_size), +- "Output depth should be equal to (Input depth / (block size * block size))"); +- +- return Status{}; +-} +-#endif +-} // namespace +- +-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) +-{ +- // DO NOTHING +-} +- +-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, +- const int32_t block_size) +-{ +- // TODO Add validation of data_layout +- _input = input; +- _output = output; +- +- // Set kernel build options +- auto layout_out = output->info()->data_layout(); +- std::set build_opts; +- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); +- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); +- auto depth = output->info()->dimension(index_depth); +- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); +- build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); +- +- // Create kernel +- _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( +- "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); +- +- // Configure kernel window +- Window win = calculate_max_window(*output->info(), Steps()); +- +- Coordinates coord; +- coord.set_num_dimensions(output->info()->num_dimensions()); +- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); +- +- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); +- +- // Setup input slice +- Window slice_in(slice_out); +- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- 
slice_in.set(3, Window::Dimension(0, 0, 0)); +- +- do +- { +- unsigned int idx = 0; +- add_4D_tensor_argument(idx, _input, slice_in); +- add_4D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +index 79f5ce0..67aaf2d 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp +deleted file mode 100644 +index 235e897..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp ++++ /dev/null +@@ -1,372 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
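The (disabled) validation block in the CLDepthToSpaceKernel removed above spells out the expected shape relation: spatial dimensions grow by block_size and channels shrink by block_size squared. A sketch of that relation; Dims3 is an illustrative struct, not an arm_compute type:

    #include <stdexcept>

    struct Dims3 { int w, h, c; };

    // Output shape implied by the shape checks in the deleted kernel's
    // validate_arguments().
    Dims3 depth_to_space_output_shape(const Dims3 &in, int block_size)
    {
      if (block_size < 1)
        throw std::invalid_argument("Block size should be greater than or equal to 1.");
      if (in.c % (block_size * block_size) != 0)
        throw std::invalid_argument("Input depth should be divisible by (block size * block size)");
      return Dims3{in.w * block_size, in.h * block_size, in.c / (block_size * block_size)};
    }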
+- */ +- +-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" +- +-#include "arm_compute/core/AccessWindowStatic.h" +-#include "arm_compute/core/AccessWindowTranspose.h" +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/CL/OpenCL.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "support/ToolchainSupport.h" +- +-#include +-#include +-#include +- +-using namespace arm_compute; +-using namespace arm_compute::misc::shape_calculator; +- +-namespace arm_compute +-{ +-class Coordinates; +-} // namespace arm_compute +- +-namespace +-{ +-using ElementsProcessed = Steps; +- +-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, +- const ITensorInfo *output, const GEMMReshapeInfo &gemm_info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, +- "The number of dimensions for the matrix A must be <= 4"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, +- "The number of dimensions for the matrix B must be <= 3"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && +- gemm_info.reinterpret_input_as_3d(), +- "The input1 tensor cannot have more than 2 dimensions if input0 " +- "has to be reinterpreted as 3D"); +- +- const int m = gemm_info.m(); +- const int n = gemm_info.n(); +- const int k = gemm_info.k(); +- +- ARM_COMPUTE_UNUSED(m); +- ARM_COMPUTE_UNUSED(n); +- ARM_COMPUTE_UNUSED(k); +- +- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast(k)); +- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast(n)); +- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast(k)); +- if (gemm_info.reinterpret_input_as_3d()) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != +- static_cast(m)); +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast(m)); +- } +- +- if (output->total_size() != 0) +- { +- const TensorInfo tensor_info_output = +- output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); +- } +- +- return Status{}; +-} +- +-std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, +- ITensorInfo *output, +- const GEMMReshapeInfo &gemm_info, +- ElementsProcessed &num_elements_processed) +-{ +- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; +- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; +- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); +- +- Window win{}; +- Window win_out{}; +- bool window_changed = false; +- +- // In case both input and output have to be reinterpreted as 3D tensors, +- // force 
reinterpret_input_as_3d and reinterpret_output_as_3d to be false. +- if (reinterpret_input_as_3d == reinterpret_output_as_3d) +- { +- reinterpret_input_as_3d = false; +- reinterpret_output_as_3d = false; +- } +- +- // Output tensor auto inizialitation if not yet initialized +- auto_init_if_empty(*output, +- input0->clone() +- ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)) +- .set_data_type(DataType::S32)); +- +- TensorInfo tmp_info(*output); +- +- if (reinterpret_output_as_3d) +- { +- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D +- // GEMM, +- // the window needs to be constructed on the 2D collapsed version of the tensor +- TensorShape tmp_shape(output->tensor_shape()); +- tmp_shape.collapse(2U, 1U); +- tmp_info.set_tensor_shape(tmp_shape); +- } +- +- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x +- // Note: if the dot product instruction is available, the 8x2 tile has to be used +- num_elems_processed_per_iteration_x = 4; +- num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); +- +- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor +- // The only way to set properly the paddings, it is to set those explicitly through the +- // AccessWindowStatic +- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] +- : input0->tensor_shape()[1]; +- const int bottom_pad = +- (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % +- num_elems_processed_per_iteration_y; +- +- // Configure window +- win = calculate_max_window( +- tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); +- win_out = calculate_max_window( +- *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); +- +- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), +- input0->dimension(1) + bottom_pad); +- AccessWindowStatic input1_access( +- input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), +- input1->dimension(1)); +- AccessWindowStatic output_access( +- output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), +- output->dimension(1) + bottom_pad); +- +- window_changed = +- update_window_and_padding(win, input0_access, +- input1_access) || // window used by the execute_window_loop +- update_window_and_padding( +- win_out, +- output_access); // window used to update the padding requirements of output tensor +- +- Coordinates coord; +- coord.set_num_dimensions(output->num_dimensions()); +- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape())); +- +- // Collapse along the Z direction +- // This collapse needs to be here in order to tune the Z dimension of LWS +- Window collapsed = win; +- const unsigned int dimension_to_collapse = +- std::min(static_cast(output->num_dimensions()), 2u); +- collapsed = win.collapse(win, dimension_to_collapse); +- +- Status err = (window_changed) +- ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") +- : Status{}; +- return std::make_pair(err, collapsed); +-} +-} // namespace +- +-CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx() +- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), +- _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false) +-{ +-} +- +-void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1, +- ICLTensor *output, +- const GEMMReshapeInfo &gemm_info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); +- +- ARM_COMPUTE_ERROR_THROW_ON( +- validate_arguments(input0->info(), input1->info(), output->info(), gemm_info)); +- +- _input0 = input0; +- _input1 = input1; +- _output = output; +- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); +- +- // In case both input and output have to be reinterpreted as 3D tensors, +- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. +- if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) +- { +- _reinterpret_input_as_3d = false; +- _reinterpret_output_as_3d = false; +- } +- +- // Check if we need to slide the matrix B +- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d +- ? _input0->info()->num_dimensions() - 1 +- : _input0->info()->num_dimensions(); +- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); +- +- ElementsProcessed num_elements_processed{}; +- +- // Configure kernel window +- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), +- gemm_info, num_elements_processed); +- ARM_COMPUTE_ERROR_THROW_ON(win_config.first); +- ICLKernel::configure_internal(win_config.second); +- +- // Create build options +- std::string kernel_name(" "); +- CLBuildOptions build_opts; +- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); +- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); +- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, +- "-DHEIGHT_GEMM3D=" + +- support::cpp11::to_string(output->info()->dimension(1))); +- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, +- "-DDEPTH_GEMM3D=" + +- support::cpp11::to_string(output->info()->dimension(2))); +- build_opts.add_option_if(!_slide_matrix_b, +- "-DMATRIX_B_DEPTH=" + +- support::cpp11::to_string(input1->info()->dimension(2))); +- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); +- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + +- support::cpp11::to_string(num_elements_processed.x())); +- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + +- support::cpp11::to_string(num_elements_processed.y())); +- +- kernel_name = "gemmlowp_mm_midgard_ex"; +- +- // Create kernel +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); +- +- // Set config_id for enabling LWS tuning +- _config_id = kernel_name; +- _config_id += "_"; +- _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); +- _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); +- _config_id += lower_string(string_from_data_type(input0->info()->data_type())); +- _config_id += "_"; +- _config_id += support::cpp11::to_string(output->info()->dimension(1)); +- _config_id += "_"; +- _config_id += support::cpp11::to_string(output->info()->dimension(0)); +-} +- +-Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0, +- const ITensorInfo *input1, +- const ITensorInfo *output, +- const GEMMReshapeInfo &gemm_info) +-{ +- ElementsProcessed num_elements_processed{}; +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info)); +- ARM_COMPUTE_RETURN_ON_ERROR( +- validate_and_configure_window(input0->clone().get(), input1->clone().get(), +- output->clone().get(), gemm_info, num_elements_processed) +- .first); +- +- return Status{}; +-} +- +-void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- if (_input1->info()->num_dimensions() < 3) +- { +- // The stride_z for matrix B must be zero if we do not slice +- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); +- } +- +- Window slice = window.first_slice_window_3D(); +- Window slice_matrix_b = slice; +- +- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); +- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); +- +- if (_reinterpret_input_as_3d) +- { +- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor +- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; +- const unsigned int total_cross_plane_pad = +- _input0->info()->padding().top + _input0->info()->padding().bottom; +- _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); +- } +- +- if (_reinterpret_output_as_3d) +- { +- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor +- const unsigned int idx0 = +- 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); +- const unsigned int total_cross_plane_pad = +- _output->info()->padding().top + _output->info()->padding().bottom; +- _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); +- } +- +- do +- { +- Window slice_b = slice; +- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A +- // more than 2 +- // This scenario can happen when the matrix multiplication is used to perform a convolution +- // operation +- if (!_slide_matrix_b) +- { +- slice_b = slice_matrix_b; +- } +- +- unsigned int idx = 0; +- add_2D_tensor_argument(idx, _input0, slice); +- add_2D_tensor_argument(idx, _input1, slice_b); +- add_2D_tensor_argument(idx, _output, slice); +- _kernel.setArg(idx++, +- static_cast(_input0->info()->strides_in_bytes()[2])); +- _kernel.setArg(idx++, +- static_cast(_input1->info()->strides_in_bytes()[2])); +- _kernel.setArg(idx++, +- static_cast(_output->info()->strides_in_bytes()[2])); +- enqueue(queue, *this, slice, lws_hint()); +- } while (window.slide_window_slice_3D(slice)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +index 3a25987..3bfe3e4 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +@@ -45,6 +45,7 @@ + #include "arm_compute/core/CL/ICLTensor.h" + #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + #include "arm_compute/core/UtilsEx.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +index 7fbdcda..930e7c9 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +@@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso + _hits = hits; + + // Make _lookup_indices tensor +- _lookup_indices = arm_compute::support::cpp14::make_unique(); ++ _lookup_indices = support::cpp14::make_unique(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +index b45f6bb..61c14d2 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +@@ -48,7 +48,7 @@ + #include "arm_compute/core/TensorInfo.h" + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Window.h" +- ++#include "support/StringSupport.h" + #include "support/ToolchainSupport.h" + + namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +index d305896..6b27c99 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp ++++ 
b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +@@ -49,6 +49,7 @@ + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +index 74f7b41..643c8b1 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp +deleted file mode 100644 +index 8910a7b..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp ++++ /dev/null +@@ -1,210 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
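The CLGEMMLowpMatrixMultiplyKernelEx removed above validates that A, B and the output agree with the m/n/k values carried in GEMMReshapeInfo. A sketch of the 2D case of those checks; the plain size_t parameters stand in for ITensorInfo::dimension(0) and dimension(1):

    #include <cstddef>

    // A(m x k) * B(k x n): dimension(0) is the column count and dimension(1)
    // the row count, as in the deleted validate_arguments().
    bool gemmlowp_dims_consistent(std::size_t a_dim0, std::size_t a_dim1,
                                  std::size_t b_dim0, std::size_t b_dim1,
                                  int m, int n, int k)
    {
      return a_dim0 == static_cast<std::size_t>(k)   // A has k columns
          && a_dim1 == static_cast<std::size_t>(m)   // and m rows (2D case)
          && b_dim0 == static_cast<std::size_t>(n)   // B has n columns
          && b_dim1 == static_cast<std::size_t>(k);  // and k rows
      // When reinterpret_input_as_3d is set, dimension(1) * dimension(2) of A
      // must equal m instead, as in the original code.
    }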
+- */ +- +-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-constexpr unsigned int num_elems_processed_per_iteration = 16; +- +-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +-{ +- const TensorShape &out_shape = +- TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); +- +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, +- DataType::QASYMM8); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, +- DataType::QASYMM8); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, +- "Inputs are not broadcast compatible"); +- // Validate in case of configured output +- if (output->total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, +- DataType::QASYMM8); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), +- "Wrong shape for output"); +- } +- return Status{}; +-} +-} // namespace +- +-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} +- +-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); +- +- _input = input; +- _alpha = alpha; +- _output = output; +- +- // Create kernel +- std::string kernel_name = "prelu"; +- std::set build_opts; +- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); +- build_opts.emplace( +- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); +- +- if (is_data_type_quantized_asymmetric(input->info()->data_type())) +- { +- build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string( +- input->info()->quantization_info().uniform().offset)); +- build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string( +- alpha->info()->quantization_info().uniform().offset)); +- build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string( +- output->info()->quantization_info().uniform().offset)); +- build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string( +- input->info()->quantization_info().uniform().scale)); +- build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string( +- alpha->info()->quantization_info().uniform().scale)); +- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string( +- output->info()->quantization_info().uniform().scale)); +- kernel_name += "_qasymm8"; +- } +- _kernel = +- static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); +- +- const std::pair broadcast_pair = +- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); +- +- const TensorShape &out_shape = broadcast_pair.first; +- const ValidRegion &valid_region = broadcast_pair.second; +- +- // Auto initialize output if not initialized +- { +- set_shape_if_empty(*output->info(), out_shape); +- +- if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) +- { +- set_format_if_unknown(*output->info(), Format::F16); +- } +- else if 
(input->info()->data_type() == DataType::F32 || +- alpha->info()->data_type() == DataType::F32) +- { +- set_format_if_unknown(*output->info(), Format::F32); +- } +- } +- +- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); +- Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); +- Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); +- +- AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); +- AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); +- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); +- +- update_window_and_padding(win_input1, input1_access) || +- update_window_and_padding(win_input2, input2_access) || +- update_window_and_padding(win, output_access); +- +- output_access.set_valid_region(win, valid_region); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- const TensorShape &in_shape1 = _input->info()->tensor_shape(); +- const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); +- const TensorShape &out_shape = _output->info()->tensor_shape(); +- +- bool can_collapse = true; +- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) +- { +- can_collapse = +- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); +- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) +- { +- can_collapse = (in_shape1[d] == in_shape2[d]); +- } +- } +- +- bool has_collapsed = false; +- Window collapsed = +- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) +- : window; +- +- const TensorShape &in_shape1_collapsed = +- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; +- const TensorShape &in_shape2_collapsed = +- has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; +- +- Window slice = collapsed.first_slice_window_3D(); +- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); +- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice_input1); +- add_3D_tensor_argument(idx, _alpha, slice_input2); +- add_3D_tensor_argument(idx, _output, slice); +- +- enqueue(queue, *this, slice); +- +- collapsed.slide_window_slice_3D(slice_input1); +- collapsed.slide_window_slice_3D(slice_input2); +- } while (collapsed.slide_window_slice_3D(slice)); +-} +- +-BorderSize CLPReLUKernel::border_size() const +-{ +- const unsigned int replicateSize = +- _output->info()->dimension(0) - +- std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); +- const unsigned int border = +- std::min(num_elems_processed_per_iteration - 1U, replicateSize); +- return BorderSize(0, border, 0, 0); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +index 2d551f6..1a7a18c 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +@@ -49,6 +49,7 @@ + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" ++#include "support/StringSupport.h" + + namespace arm_compute + { +@@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac + + // Output must always be initialized + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + + return Status{}; +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +index a983183..06c2579 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + namespace +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +index ff1904a..8d8853c 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +@@ -48,6 +48,7 @@ + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" ++#include "support/StringSupport.h" + + #include + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +deleted file mode 100644 +index 64fc038..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp ++++ /dev/null +@@ -1,148 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, +- const int32_t block_size) +-{ +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, +- "Block size should be greater than or equal to 1."); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), +- "Input batch should be equal to Output batch"); +- +- auto layout_out = input->data_layout(); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); +- +- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); +- auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); +- auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), +- "Output depth should be equal to (input depth * block size *block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || +- (input->dimension(index_height) % block_size), +- "Input height and width should be divisible by block size"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- (output->dimension(index_width) != (input->dimension(index_width) / block_size)) || +- (output->dimension(index_height) != (input->dimension(index_height) / block_size)), +- "Output height and width should be equal to " +- "input_height/blocksize and input_width/blocksize respectively"); +- +- return Status{}; +-} +- +-} // namespace +- +-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} +- +-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, +- const int32_t block_size) +-{ +- +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); +- +- _input = input; +- _output = output; +- +- // Set kernel build options +- auto layout_out = input->info()->data_layout(); +- std::set build_opts; +- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); +- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); +- auto depth = input->info()->dimension(index_depth); +- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); +- build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); +- +- // Create kernel +- _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( +- "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps()); +- +- Coordinates coord; +- coord.set_num_dimensions(output->info()->num_dimensions()); +- 
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); +- +- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); +- +- // Setup output slice +- Window slice_out(slice_in); +- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- slice_out.set(3, Window::Dimension(0, 0, 0)); +- +- do +- { +- unsigned int idx = 0; +- add_4D_tensor_argument(idx, _input, slice_in); +- add_4D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_in); +- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp +deleted file mode 100644 +index 61999cb..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp ++++ /dev/null +@@ -1,188 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibrary.h" +-#include "arm_compute/core/CL/CLValidate.h" +-#include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +- +-using namespace arm_compute; +- +-CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() +- : _input(nullptr), _output(nullptr), _inner_border(), _info() +-{ +-} +- +-Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, +- const ITensorInfo *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); +- +- const DataLayout data_layout = input->data_layout(); +- +- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- +- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); +- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); +- +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); +- for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); +- } +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, +- "inner_border_right must be smaller that stride_x"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, +- "inner_border_top must be smaller that stride_y"); +- +- return Status{}; +-} +- +-void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- _input = input; +- _output = output; +- _inner_border = inner_border; +- _info = info; +- +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( +- input->info(), output->info(), inner_border, info)); +- +- // Create kernel +- CLBuildOptions build_opts; +- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); +- _kernel = static_cast( +- CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); +- +- constexpr unsigned int num_elems_processed_per_iteration = 1; +- +- // Configure kernel window +- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); +- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); +- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- 
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- const DataLayout data_layout = _input->info()->data_layout(); +- +- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- +- const int out_start_x = _info.pad_left(); +- const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - +- _info.pad_right() + _info.stride().first - 1; +- const int out_step_x = _info.stride().first; +- +- const int out_start_y = _inner_border.top + _info.pad_top(); +- const int out_end_y = +- _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; +- const int out_step_y = _info.stride().second; +- +- switch (data_layout) +- { +- case DataLayout::NCHW: +- { +- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); +- +- Window slice_out = collapsed.first_slice_window_3D(); +- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); +- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); +- +- Window slice_in = collapsed.first_slice_window_3D(); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice_in); +- add_3D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (collapsed.slide_window_slice_3D(slice_in) && +- collapsed.slide_window_slice_3D(slice_out)); +- break; +- } +- case DataLayout::NHWC: +- { +- // NOTE: not collapsing in NHWC +- Window slice_out = window.first_slice_window_3D(); +- slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); +- slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); +- +- Window slice_in = window.first_slice_window_3D(); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice_in); +- add_3D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data layout"); +- } +-} +diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp +deleted file mode 100644 +index 648afb3..0000000 +--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp ++++ /dev/null +@@ -1,118 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +- +-#include +-#include +- +-namespace arm_compute +-{ +-CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} +- +-bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } +- +-void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- _input = input; +- _output = output; +- _info = info; +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps()); +- +- // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped +- Coordinates coord; +- coord.set_num_dimensions(output->info()->num_dimensions()); +- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); +- +- ICPPKernel::configure(win); +-} +- +-void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); +- +- // Initialize _scaled_output buffer +- const int width_scaled = _output->info()->dimension(0); +- const int height_scaled = _output->info()->dimension(1); +- const int stride_x = _info.stride().first; +- const int stride_y = _info.stride().second; +- const int start_x = _info.pad_left(); +- const int start_y = _info.pad_top(); +- const int end_y = height_scaled - _info.pad_bottom(); +- const int end_x = width_scaled - _info.pad_top(); +- const size_t element_size = _input->info()->element_size(); +- +- // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset +- const uint8_t fill_value = +- _output->info()->data_type() == DataType::QASYMM8 +- ? 
utility::clamp(_output->info()->quantization_info().uniform().offset) +- : 0; +- // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte +- // values in a buffer of uint8_ts +- std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); +- +- // Create window +- Window window_out(window); +- window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); +- window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); +- +- // Create iterators +- Iterator in(_input, window); +- Iterator out(_output, window_out); +- +- execute_window_loop( +- window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp +deleted file mode 100644 +index fbb9dbc..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp ++++ /dev/null +@@ -1,671 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/NEON/kernels/NECastKernel.h" +- +-#include "arm_compute/core/AccessWindowStatic.h" +-#include "arm_compute/core/CPP/Validate.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/NEON/NEAsymm.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +- +-#include +- +-namespace arm_compute +-{ +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, +- DataType::QASYMM8, DataType::U32, +- DataType::S32, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && +- input->data_type() != DataType::U8); +- +- if (output->tensor_shape().total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, +- DataType::QASYMM8, DataType::U32, +- DataType::S32, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); +- } +- +- return Status{}; +-} +- +-std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +-{ +- // Configure kernel window +- Window win = calculate_max_window(*input, Steps()); +- +- // Output tensor auto initialization if not yet initialized +- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); +- +- // NECastKernel doesn't need padding so update_window_and_padding() can be skipped +- Coordinates coord; +- coord.set_num_dimensions(output->num_dimensions()); +- output->set_valid_region(ValidRegion(coord, output->tensor_shape())); +- +- return std::make_tuple(Status{}, win); +-} +- +-typedef struct bool8x16 +-{ +- uint8x16_t val; +-} bool8x16_t; +- +-static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } +- +-template inline ToV vcast(const FromV &v) { return v; } +-template <> inline uint8x16_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +-} +- +-template <> inline uint32x4x4_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +- +- const uint32x4x4_t ret = {{ +- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), +- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), +- }}; +- +- return ret; +-} +- +-template <> inline int32x4x4_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +- +- const int32x4x4_t ret = {{ +- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), +- 
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +- +- const float32x4x4_t ret = {{ +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), +- }}; +- +- return ret; +-} +- +-template <> inline uint32x4x4_t vcast(const uint8x16_t &v) +-{ +- const uint32x4x4_t ret = {{ +- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), +- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), +- }}; +- +- return ret; +-} +- +-template <> inline int32x4x4_t vcast(const uint8x16_t &v) +-{ +- const int32x4x4_t ret = {{ +- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const uint8x16_t &v) +-{ +- const float32x4x4_t ret = {{ +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), +- }}; +- +- return ret; +-} +- +-template <> inline uint8x16_t vcast(const int32x4x4_t &v) +-{ +- // Saturate cast +- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), +- vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); +-} +- +-template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) +-{ +- // Saturate cast +- const uint32x4x4_t ret = {{ +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const int32x4x4_t &v) +-{ +- const float32x4x4_t ret = {{ +- vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), +- vcvtq_f32_s32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template <> inline uint8x16_t vcast(const uint32x4x4_t &v) +-{ +- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), +- vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); +-} +- +-template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) +-{ +- const int32x4x4_t ret = {{ +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), +- 
vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), +- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), +- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), +- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) +-{ +- const float32x4x4_t ret = {{ +- vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), +- vcvtq_f32_u32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template <> inline uint8x16_t vcast(const float32x4x4_t &v) +-{ +- // Saturate cast +- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), +- vqmovun_s32(vcvtq_s32_f32(v.val[1])))), +- vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), +- vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); +-} +- +-template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) +-{ +- const uint32x4x4_t ret = {{ +- vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), +- vcvtq_u32_f32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template <> inline int32x4x4_t vcast(const float32x4x4_t &v) +-{ +- const int32x4x4_t ret = {{ +- vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), +- vcvtq_s32_f32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template struct cast_vector; +-template <> struct cast_vector +-{ +- using type = bool8x16_t; +-}; +-template <> struct cast_vector +-{ +- using type = uint8x16_t; +-}; +-template <> struct cast_vector +-{ +- using type = uint32x4x4_t; +-}; +-template <> struct cast_vector +-{ +- using type = int32x4x4_t; +-}; +-template <> struct cast_vector +-{ +- using type = float32x4x4_t; +-}; +- +-template inline void store_result(T *ptr, const typename cast_vector::type &v) +-{ +- wrapper::vstore(ptr, v.val[0]); +- wrapper::vstore(ptr + 4, v.val[1]); +- wrapper::vstore(ptr + 8, v.val[2]); +- wrapper::vstore(ptr + 12, v.val[3]); +-} +- +-template <> inline void store_result(uint8_t *ptr, const uint8x16_t &v) +-{ +- wrapper::vstore(ptr, v); +-} +- +-inline bool8x16_t vloadq(const bool *ptr) +-{ +- bool8x16_t ret; +- ret.val = wrapper::vloadq(reinterpret_cast(ptr)); +- return ret; +-} +- +-template inline typename cast_vector::type load_input(const T *ptr) +-{ +- return wrapper::vloadq(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const bool *ptr) +-{ +- return vloadq(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const uint32_t *ptr) +-{ +- return vld4q_u32(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const int32_t *ptr) +-{ +- return vld4q_s32(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const float *ptr) +-{ +- return vld4q_f32(ptr); +-} +- +-template inline T get_value(const T *ptr) { return *ptr; } +- +-template <> inline bool get_value(const bool *ptr) +-{ +- bool ret = (*ptr != 0); +- return ret; +-} +- +-template void run_cast(const ITensor *input, ITensor *output, const Window &window) +-{ +- const int window_step_x = 16; +- const auto window_start_x = static_cast(window.x().start()); +- const auto window_end_x = static_cast(window.x().end()); +- +- // Collapse window and reset first dimension to 
handle tail calculations manually +- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); +- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); +- +- // Create iterators +- Iterator in(input, win_collapsed); +- Iterator out(output, win_collapsed); +- +-#ifdef __aarch64__ +- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +-#else //__aarch64__ +- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +-#endif //__aarch64__ +- +- execute_window_loop( +- win_collapsed, +- [&](const Coordinates &) { +- const auto in_ptr = reinterpret_cast(in.ptr()); +- +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- using from_vector = typename cast_vector::type; +- const from_vector vin = load_input(in_ptr + x); +- +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::QASYMM8: +- { +- using to_vector = typename cast_vector::type; +- const UniformQuantizationInfo &qinfo_out = +- output->info()->quantization_info().uniform(); +- const auto vf = vcast(vin); +- const auto vout = vquantize(vf, qinfo_out); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::U32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::S32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::F32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- +- // Compute left-over elements +- for (; x < window_end_x; ++x) +- { +- FromT val = get_value(in_ptr + x); +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::QASYMM8: +- { +- const QuantizationInfo &qinfo_out = output->info()->quantization_info(); +- const auto qval = +- quantize_qasymm8(static_cast(val), qinfo_out, rounding_policy); +- *(reinterpret_cast(out.ptr()) + x) = qval; +- break; +- } +- case DataType::U32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::S32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::F32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- }, +- in, out); +-} +- +-void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) +-{ +- const int window_step_x = 16; +- const auto window_start_x = static_cast(window.x().start()); +- const auto window_end_x = static_cast(window.x().end()); +- +- // Collapse window and reset first dimension to handle tail calculations manually +- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); +- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); +- +- // Create iterators +- Iterator in(input, win_collapsed); +- Iterator out(output, win_collapsed); +- +-#ifdef __aarch64__ +- constexpr 
RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +-#else //__aarch64__ +- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +-#endif //__aarch64__ +- const auto &qinfo_in = input->info()->quantization_info().uniform(); +- const auto &qinfo_out = output->info()->quantization_info().uniform(); +- +- execute_window_loop( +- win_collapsed, +- [&](const Coordinates &) { +- const auto in_ptr = reinterpret_cast(in.ptr()); +- +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- using from_vector = typename cast_vector::type; +- const auto vf = wrapper::vloadq(in_ptr + x); +- const auto vin = vdequantize(vf, qinfo_in); +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::QASYMM8: +- { +- using to_vector = typename cast_vector::type; +- const auto vf = vcast(vin); +- const auto vout = vquantize(vf, qinfo_out); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::U32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::S32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::F32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- +- // Compute left-over elements +- for (; x < window_end_x; ++x) +- { +- qasymm8_t qval_in = *(in_ptr + x); +- const auto val = dequantize_qasymm8(qval_in, qinfo_in); +- +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::QASYMM8: +- { +- const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy); +- *(reinterpret_cast(out.ptr()) + x) = qval_out; +- break; +- } +- case DataType::U32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::S32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::F32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- }, +- in, out); +-} +-} // namespace +- +-NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) +-{ +-} +- +-void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); +- +- _input = input; +- _output = output; +- _input_subtype = input_subtype; +- +- // Configure kernel window +- auto win_config = validate_and_configure_window(input->info(), output->info()); +- +- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); +- +- INEKernel::configure(std::get<1>(win_config)); +-} +- +-Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, 
input_subtype)); +- ARM_COMPUTE_RETURN_ON_ERROR( +- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); +- return Status{}; +-} +- +-void NECastKernel::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); +- +- switch (_input->info()->data_type()) +- { +- case DataType::U8: +- if (_input_subtype == SubDataType::BOOL) +- { +- run_cast(_input, _output, window); +- } +- else +- { +- run_cast(_input, _output, window); +- } +- break; +- case DataType::QASYMM8: +- run_cast_qasymm8(_input, _output, window); +- break; +- case DataType::U32: +- run_cast(_input, _output, window); +- break; +- case DataType::S32: +- run_cast(_input, _output, window); +- break; +- case DataType::F32: +- run_cast(_input, _output, window); +- break; +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp +deleted file mode 100644 +index 95e269d..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +-#include +-#include +- +-using namespace arm_compute::misc::shape_calculator; +- +-namespace arm_compute +-{ +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); +- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); +- +- const DataLayout data_layout = input->data_layout(); +- const int idx_channel = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != +- 0); +- // Validate output if initialized +- if (output->total_size() != 0) +- { +- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const int idx_height = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != +- (block_shape * input->tensor_shape()[idx_width])); +- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != +- (block_shape * input->tensor_shape()[idx_height])); +- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- } +- +- return Status{}; +-} +-} // namespace +- +-NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() +- : _input(nullptr), _output(nullptr), _block_shape() +-{ +-} +- +-void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); +- // Output auto inizialitation if not yet initialized +- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); +- +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); +- +- _input = input; +- _output = output; +- _block_shape = block_shape; +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps()); +- ICPPKernel::configure(win); +-} +- +-Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); +- return Status{}; +-} +- +-void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); +- +- const int idx_channel = +- get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); +- const int depth_size = _input->info()->dimension(idx_channel); +- const int r = (depth_size / (_block_shape * _block_shape)); +- const int element_size = _input->info()->element_size(); +- +- Window slice_out = window.first_slice_window_3D(); +- +- // The slice_out slice does not move +- 
slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- +- // Main loop for NCHW and NHWC +- if (_input->info()->data_layout() == DataLayout::NCHW) +- { +- Window slice_in = window.first_slice_window_2D(); +- do +- { +- Iterator in(_input, slice_in); +- execute_window_loop(slice_in, +- [&](const Coordinates &id) { +- const int x = id.x(); +- const int y = id.y(); +- +- const int z = id.z() % r; +- const int out_x = x * _block_shape + (id.z() / r) % _block_shape; +- const int out_y = y * _block_shape + (id.z() / r) / _block_shape; +- Coordinates output_coords{out_x, out_y, z, id[3]}; +- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); +- }, +- in); +- } while (window.slide_window_slice_2D(slice_in)); +- } +- else +- { +- Window slice_in = window.first_slice_window_3D(); +- do +- { +- Iterator in(_input, slice_in); +- execute_window_loop(slice_in, +- [&](const Coordinates &id) { +- const int x = id.y(); +- const int y = id.z(); +- +- const int z = id.x() % r; +- const int out_x = x * _block_shape + (id.x() / r) % _block_shape; +- const int out_y = y * _block_shape + (id.x() / r) / _block_shape; +- Coordinates output_coords{z, out_x, out_y, id[3]}; +- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); +- }, +- in); +- } while (window.slide_window_slice_3D(slice_in)); +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp +deleted file mode 100644 +index 200fc4f..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp ++++ /dev/null +@@ -1,221 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" +- +-#include "arm_compute/core/CPP/Validate.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/IAccessWindow.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/NEAsymm.h" +-#include "arm_compute/core/NEON/NEFixedPoint.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Validate.h" +- +-#include +-#include +-#include +-#include +-#include +- +-namespace arm_compute +-{ +-class Coordinates; +- +-namespace +-{ +-template +-inline ScalarType elementwise_op_scalar(const ScalarType &a) +-{ +- switch (op) +- { +- case ElementWiseUnaryEx::NEG: +- return -a; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +-} +- +-template +-inline VectorType elementwise_op(const VectorType &a) +-{ +- switch (op) +- { +- case ElementWiseUnaryEx::NEG: +- return wrapper::vneg(a); +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +-} +- +-template +-void elementwise_op(const ITensor *in, ITensor *out, const Window &window) +-{ +- const int window_step_x = 16 / sizeof(ScalarType); +- const auto window_start_x = static_cast(window.x().start()); +- const auto window_end_x = static_cast(window.x().end()); +- +- Window win = window; +- win.set(Window::DimX, Window::Dimension(0, 1, 1)); +- +- Iterator input(in, win); +- Iterator output(out, win); +- +- execute_window_loop(win, +- [&](const Coordinates &) { +- auto output_ptr = reinterpret_cast(output.ptr()); +- const auto input_ptr = reinterpret_cast(input.ptr()); +- +- int x = window_start_x; +- for (; x <= window_end_x - window_step_x; x += window_step_x) +- { +- wrapper::vstore(output_ptr + x, +- elementwise_op(wrapper::vloadq(input_ptr + x))); +- } +- for (; x < window_end_x; ++x) +- { +- *(output_ptr + x) = elementwise_op_scalar(*(input_ptr + x)); +- } +- }, +- input, output); +-} +- +-template +-std::function +-configure_func(const ITensor *input, ITensor *output) +-{ +- std::string function_to_call("op_"); +- function_to_call += string_from_data_type(input->info()->data_type()) + "_"; +- function_to_call += string_from_data_type(output->info()->data_type()); +- +- static std::map +- map_function = { +- {"op_F32_F32", &elementwise_op}, {"op_S32_S32", &elementwise_op}, +- }; +-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +- map_function["op_F16_F16"] = &elementwise_op; +-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +- +- auto it = map_function.find(function_to_call); +- +- if (it != map_function.end()) +- { +- auto func = it->second; +- return [func](const ITensor *input, ITensor *output, const Window &window) { +- func(input, output, window); +- }; +- } +- return nullptr; +-} +-} // namespace +- +-NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() +- : _function(nullptr), _input(nullptr), _output(nullptr) +-{ +-} +- +-void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, +- ITensor *output) +-{ +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- // Configure kernel window +- const std::pair broadcast_pair = +- 
ITensorInfo::broadcast_shape_and_valid_region(*input->info()); +- const TensorShape &out_shape = broadcast_pair.first; +- const ValidRegion &valid_region = broadcast_pair.second; +- +- // Auto initialize output if not initialized +- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); +- +- Window win = calculate_max_window(valid_region); +- +- _input = input; +- _output = output; +- +- INEKernel::configure(win); +- +- switch (op) +- { +- case ElementWiseUnaryEx::NEG: +- _function = configure_func(input, output); +- break; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +-} +- +-Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, +- const ITensorInfo &output) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, +- DataType::S32); +- +- // Validate in case of configured output +- if (output.total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); +- } +- +- return Status{}; +-} +- +-Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, +- const ITensorInfo *output) +-{ +- ARM_COMPUTE_UNUSED(op); +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); +- return Status{}; +-} +- +-void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); +- ARM_COMPUTE_ERROR_ON(_function == nullptr); +- _function(_input, _output, window); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp +deleted file mode 100644 +index 641641b..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp ++++ /dev/null +@@ -1,291 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +- +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/NEAsymm.h" +-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Window.h" +- +-#include +- +-using namespace arm_compute; +-namespace +-{ +- +-/** Conditional element-wise operations */ +-enum class ConditionalOperation +-{ +- PRELU, /**< (x * y) for x < 0, x for x >= 0 */ +-}; +- +-template +-inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b) +-{ +- auto res = ScalarType(0); +- +- switch (op) +- { +- case ConditionalOperation::PRELU: +- res = a < 0 ? a * b : a; +- break; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +- return res; +-} +- +-template +-inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b, +- QuantizationInfo qinfo) +-{ +- return quantize_qasymm8(elementwise_conditional_op_scalar(a, b), qinfo, +- RoundingPolicy::TO_NEAREST_UP); +-} +- +-template +-inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b) +-{ +- VectorType res = {0, 0, 0, 0}; +- VectorType const_0 = {0, 0, 0, 0}; +- +- switch (op) +- { +- case ConditionalOperation::PRELU: +- res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b)); +- ; +- break; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +- return res; +-} +- +-template +-inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b) +-{ +- float32x4x4_t out = {{ +- elementwise_conditional_op(a.val[0], b.val[0]), +- elementwise_conditional_op(a.val[1], b.val[1]), +- elementwise_conditional_op(a.val[2], b.val[2]), +- elementwise_conditional_op(a.val[3], b.val[3]), +- }}; +- return out; +-} +- +-template +-inline VectorType elementwise_conditional_op_broadcast(const VectorType &a, +- const ScalarType &broadcast_value, +- const bool reorder) +-{ +- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); +- return elementwise_conditional_op(reorder ? broadcast_vector : a, +- reorder ? 
a : broadcast_vector); +-} +- +-template +-inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, +- const ScalarType *input1_ptr, +- const ScalarType *input2_ptr, ScalarType *output_ptr) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- const auto a = wrapper::vloadq(input1_ptr + x); +- const auto b = wrapper::vloadq(input2_ptr + x); +- wrapper::vstore(output_ptr + x, elementwise_conditional_op(a, b)); +- } +- return x; +-} +- +-template +-inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, +- int window_step_x, const uint8_t *input1_ptr, +- const uint8_t *input2_ptr, uint8_t *output_ptr, +- int32x4_t voffset1, int32x4_t voffset2, +- float32x4_t vscale1, float32x4_t vscale2, +- float32x4_t voffseto, float32x4_t invvscaleo) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- // Get inputs and compute output +- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); +- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); +- const float32x4x4_t rf = elementwise_conditional_op(af, bf); +- store_quantized(output_ptr + x, rf, voffseto, invvscaleo); +- } +- return x; +-} +- +-template +-inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, +- int window_step_x, +- const ScalarType *non_broadcast_input_ptr, +- const ScalarType &broadcast_value, +- ScalarType *output_ptr, const bool reorder) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); +- wrapper::vstore(output_ptr + x, +- elementwise_conditional_op_broadcast(a, broadcast_value, reorder)); +- } +- return x; +-} +- +-template +-inline int elementwise_conditional_op_quantized_broadcast_loop( +- int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, +- float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, +- float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- const float32x4x4_t af = +- load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); +- const float32x4x4_t rf = elementwise_conditional_op(reorder ? broadcast_vector : af, +- reorder ? 
af : broadcast_vector); +- store_quantized(output_ptr + x, rf, voffseto, invvscaleo); +- } +- return x; +-} +- +-template +-void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, +- const Window &window) +-{ +- elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar, +- &elementwise_conditional_op_broadcast_loop, +- &elementwise_conditional_op_loop); +-} +- +-template +-void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, +- const Window &window) +-{ +- elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar, +- &elementwise_conditional_op_quantized_broadcast_loop, +- &elementwise_conditional_op_quantized_loop); +-} +-} // namespace +- +-NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} +- +-void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); +- +- // Configure kernel window +- const std::pair broadcast_pair = +- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); +- const TensorShape &out_shape = broadcast_pair.first; +- const ValidRegion &valid_region = broadcast_pair.second; +- +- // Auto initialize output if not initialized +- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); +- +- Window win = calculate_max_window(valid_region); +- +- _input = input; +- _alpha = alpha; +- _output = output; +- INEKernel::configure(win); +-} +- +-void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); +- +- if (_input->info()->data_type() == DataType::F32) +- { +- elementwise_conditional_op(_input, _alpha, +- _output, window); +- } +- else if (_input->info()->data_type() == DataType::QASYMM8) +- { +- elementwise_conditional_op_quantized(_input, _alpha, _output, +- window); +- } +- else +- { +- ARM_COMPUTE_ERROR("Wrong Type"); +- } +-} +- +-Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, +- const ITensorInfo &output) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); +- +- const TensorShape out_shape = +- TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, +- "Inputs are not broadcast compatible"); +- +- // Checks performed when output is configured +- if (output.total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), +- "Wrong shape for output"); +- } +- +- return Status{}; +-} +- +-Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha, +- const ITensorInfo *output) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); +- +- return Status{}; +-} +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +index 6ba0f1f..5841f1d 100644 +--- 
a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp ++++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp +deleted file mode 100644 +index 44feb20..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
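// Reference semantics of the PReLU operation implemented by the deleted
// NEPReLUKernel above (ConditionalOperation::PRELU: x * alpha for x < 0, x otherwise),
// written as a scalar sketch. The kernel itself used NEON vbsl/vcgt/vmul plus a
// quantized QASYMM8 path; the simple modulo broadcast below is an illustration only.
#include <cstddef>

static void prelu_reference(const float *input, const float *alpha, float *output,
                            std::size_t n, std::size_t alpha_n)
{
  for (std::size_t i = 0; i < n; ++i)
  {
    const float a = alpha[alpha_n == 1 ? 0 : i % alpha_n]; // broadcast alpha against input
    output[i] = input[i] < 0.0f ? input[i] * a : input[i];
  }
}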
+- */ +- +-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +-#include +-#include +- +-using namespace arm_compute::misc::shape_calculator; +- +-namespace arm_compute +-{ +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); +- +- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); +- +- // Validate output if initialized +- if (output->total_size() != 0) +- { +- const DataLayout data_layout = input->data_layout(); +- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const int idx_height = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- const int idx_channel = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- const int idx_batch = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] != +- output->tensor_shape()[idx_batch]); +- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) != +- 0); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != +- output->tensor_shape().total_size()); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- } +- +- return Status{}; +-} +-} // namespace +- +-NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx() +- : _input(nullptr), _output(nullptr), _block_shape() +-{ +-} +- +-void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape); +- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); +- +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); +- +- _input = input; +- _block_shape = block_shape; +- _output = output; +- +- // Configure kernel window +- Window win = calculate_max_window(*output->info(), Steps()); +- INEKernel::configure(win); +-} +- +-Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); +- return Status{}; +-} +- +-void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); +- +- const DataLayout data_layout = _input->info()->data_layout(); +- const int channel_idx = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- const int element_size = _input->info()->element_size(); +- +- const size_t channel_size = _input->info()->dimension(channel_idx); +- +- Window slice_out = window.first_slice_window_3D(); +- +- int 
batch_id = 0; +- +- // Main loop for NCHW and NHWC +- if (_output->info()->data_layout() == DataLayout::NCHW) +- { +- do +- { +- Iterator out(_output, slice_out); +- execute_window_loop(slice_out, +- [&](const Coordinates &id) { +- const size_t channel_id = id.z(); +- const size_t in_x = +- id.x() * _block_shape + (channel_id / channel_size) % _block_shape; +- const size_t in_y = +- id.y() * _block_shape + (channel_id / channel_size) / _block_shape; +- const int z = channel_id % channel_size; +- Coordinates input_coords{in_x, in_y, z, batch_id}; +- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); +- }, +- out); +- ++batch_id; +- } while (window.slide_window_slice_3D(slice_out)); +- } +- else +- { +- do +- { +- Iterator out(_output, slice_out); +- execute_window_loop(slice_out, +- [&](const Coordinates &id) { +- const size_t channel_id = id.x(); +- const size_t in_x = +- id.y() * _block_shape + (channel_id / channel_size) % _block_shape; +- const size_t in_y = +- id.z() * _block_shape + (channel_id / channel_size) / _block_shape; +- const int z = channel_id % channel_size; +- Coordinates input_coords{z, in_x, in_y, batch_id}; +- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); +- }, +- out); +- ++batch_id; +- } while (window.slide_window_slice_3D(slice_out)); +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp +deleted file mode 100644 +index 2d379cf..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp ++++ /dev/null +@@ -1,144 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
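// A plain-C++ sketch of the NCHW coordinate mapping used in the deleted
// NESpaceToDepthLayerKernelEx::run() above: the output element at
// (x, y, c_out, batch) is read from input position
// (x*block + (c_out / C_in) % block, y*block + (c_out / C_in) / block, c_out % C_in, batch).
// It assumes height and width are multiples of block, as the kernel's validate()
// enforces; tensor layouts and iterators are simplified away.
#include <cstddef>
#include <vector>

static std::vector<float> space_to_depth_nchw(const std::vector<float> &in, int batch,
                                              int c_in, int h, int w, int block)
{
  const int h_out = h / block, w_out = w / block, c_out = c_in * block * block;
  std::vector<float> out(static_cast<std::size_t>(batch) * c_out * h_out * w_out);
  auto in_at = [&](int b, int c, int y, int x) { return ((b * c_in + c) * h + y) * w + x; };
  auto out_at = [&](int b, int c, int y, int x) { return ((b * c_out + c) * h_out + y) * w_out + x; };
  for (int b = 0; b < batch; ++b)
    for (int c = 0; c < c_out; ++c)
      for (int y = 0; y < h_out; ++y)
        for (int x = 0; x < w_out; ++x)
        {
          const int in_c = c % c_in; // channel_id % channel_size
          const int off = c / c_in;  // block offset encoded in the output channel
          const int in_x = x * block + off % block;
          const int in_y = y * block + off / block;
          out[out_at(b, c, y, x)] = in[in_at(b, in_c, in_y, in_x)];
        }
  return out;
}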
+- */ +- +-#include "arm_compute/runtime/CL/functions/CLArgOperation.h" +- +-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +- +-namespace arm_compute +-{ +- +-CLArgOperation::CLArgOperation() +-{ +- // DO NOTHING +-} +- +-void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector axis, +- ArgOperation op) +-{ +- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op)); +- _input = input; +- _output = output; +- _axis = axis; +- _arg_op = op; +- // NOTE The argminmax_axis must have no duplication. +- _num_of_kernels = axis.size(); +- const size_t num_of_interm_tensors = _num_of_kernels - 1; +- +- _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); +- _argop_kernels = +- arm_compute::support::cpp14::make_unique(_num_of_kernels); +- +- TensorShape shape{input->info()->tensor_shape()}; +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- shape.set(_axis[i], 1); +- _interm_tensors[i].allocator()->init( +- TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()) +- .set_data_layout(input->info()->data_layout())); +- _interm_tensors[i].allocator()->allocate(); +- } +- +- // Set a vector that is ordered ICLTensors sequentially. +- std::vector tensors; +- tensors.emplace_back(input); +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- tensors.emplace_back(_interm_tensors.get() + i); +- } +- tensors.emplace_back(output); +- +- // Apply ArgMinMax on all kernels +- for (size_t i = 0; i < _num_of_kernels; i++) +- { +- _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op); +- } +-} +- +-Status CLArgOperation::validate(const ITensorInfo *input, const std::vector &axis, +- const ITensorInfo *output, ArgOperation op) +-{ +- const size_t num_of_kernels = axis.size(); +- const size_t num_of_interm_tensors = num_of_kernels - 1; +- +- // Create temporary tensor infos +- auto interm_tensors = +- arm_compute::support::cpp14::make_unique(num_of_interm_tensors); +- +- // Create intermediate tensor info +- TensorShape shape{input->tensor_shape()}; +- +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- shape.set(axis[i], 1); +- interm_tensors[i].set_data_type(input->data_type()); +- interm_tensors[i].set_tensor_shape(shape); +- interm_tensors[i].set_num_channels(input->num_channels()); +- } +- +- // Set a vector that is ordered ITensorInfo sequentially. 
+- std::vector tensors; +- tensors.emplace_back(input); +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- tensors.emplace_back(interm_tensors.get() + i); +- } +- tensors.emplace_back(output); +- +- // Validate argminmax only on all kernels +- for (size_t i = 0; i < num_of_kernels; i++) +- { +- ARM_COMPUTE_RETURN_ON_ERROR( +- CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op)); +- } +- +- return Status{}; +-} +- +-void CLArgOperation::run() +-{ +- for (size_t i = 0; i < _num_of_kernels; ++i) +- { +- CLScheduler::get().enqueue(_argop_kernels[i]); +- } +-} +- +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +index 92ee69a..e5122ab 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +@@ -48,7 +48,7 @@ using namespace arm_compute; + void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +deleted file mode 100644 +index b3118f3..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp ++++ /dev/null +@@ -1,52 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
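// The deleted CLArgOperation above chains one reduction kernel per requested axis,
// allocating axis.size() - 1 intermediate tensors so that kernel i reads tensors[i]
// and writes tensors[i + 1]. A sketch of that wiring only, with std::function and a
// dummy Buffer type standing in for CLArgOperationKernel / ICLTensor.
#include <cstddef>
#include <functional>
#include <vector>

struct Buffer
{
  std::vector<float> data;
};

static void chain_reductions(const Buffer &input, Buffer &output,
                             const std::vector<std::function<void(const Buffer &, Buffer &)>> &kernels)
{
  std::vector<Buffer> interm(kernels.empty() ? 0 : kernels.size() - 1);
  const Buffer *src = &input;
  for (std::size_t i = 0; i < kernels.size(); ++i)
  {
    Buffer *dst = (i + 1 == kernels.size()) ? &output : &interm[i];
    kernels[i](*src, *dst); // same tensors[i] -> tensors[i + 1] hand-off as configure()
    src = dst;
  }
}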
+- */ +- +-#include "arm_compute/runtime/CL/functions/CLCast.h" +- +-#include "arm_compute/core/CL/kernels/CLCastKernel.h" +- +-using namespace arm_compute; +- +-void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, input_subtype); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp +deleted file mode 100644 +index db66250..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp ++++ /dev/null +@@ -1,52 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" +- +-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" +- +-using namespace arm_compute; +- +-void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_size); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +new file mode 100644 +index 0000000..3dede05 +--- /dev/null ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +@@ -0,0 +1,267 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/* ++ * Copyright (c) 2019-2020 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" ++ ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/UtilsEx.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" ++#include "arm_compute/runtime/CL/CLScheduler.h" ++ ++#include ++#include ++ ++namespace arm_compute ++{ ++using namespace arm_compute::misc::shape_calculator; ++ ++CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( ++ std::shared_ptr memory_manager) // NOLINT ++ : _memory_group(std::move(memory_manager)), ++ _scale_f(), ++ _conv_f(), ++ _flip_weights(), ++ _scaled_output(), ++ _original_weights(nullptr), ++ _weights_flipped(), ++ _flip_axis(), ++ _is_prepared(false) ++{ ++} ++ ++Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, ++ const ITensorInfo *bias, ITensorInfo *output, ++ const PadStrideInfo &info, unsigned int invalid_right, ++ unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) ++{ ++ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( ++ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); ++ const DataLayout data_layout = input->data_layout(); ++ ++ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); ++ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); ++ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); ++ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); ++ ++ auto out_dims = transposeconv_output_dimensions( ++ input->dimension(idx_w), 
input->dimension(idx_h), weights->dimension(idx_w), ++ weights->dimension(idx_h), info, invalid_right, invalid_bottom); ++ ++ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); ++ ++ if (bias != nullptr) ++ { ++ if (is_data_type_quantized_asymmetric(input->data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ++ } ++ else ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ } ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); ++ } ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], ++ "Output's width is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], ++ "Output's height is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], ++ "Output's depth is invalid."); ++ ++ unsigned int pad_left = 0; ++ unsigned int pad_right = 0; ++ unsigned int pad_top = 0; ++ unsigned int pad_bottom = 0; ++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( ++ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, ++ pad_bottom); ++ TensorInfo scale_out_info(input->clone() ++ ->set_is_resizable(true) ++ .reset_padding() ++ .set_tensor_shape(scale_out_shape) ++ .set_data_layout(data_layout)); ++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ++ ++ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); ++ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, ++ conv_info, weights_info)); ++ ++ return Status{}; ++} ++ ++void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &info, unsigned int invalid_right, ++ unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) ++{ ++ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, ++ invalid_right, invalid_bottom, weights_info); ++} ++ ++void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context, ++ ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &info, unsigned int invalid_right, ++ unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ++ ++ unsigned int pad_left = 0; ++ unsigned int pad_right = 0; ++ unsigned int pad_top = 0; ++ unsigned int pad_bottom = 0; ++ const unsigned int stride_x = info.stride().first; ++ const unsigned int stride_y = info.stride().second; ++ ++ const DataLayout data_layout = input->info()->data_layout(); ++ ++ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); ++ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); ++ ++ _original_weights = weights; ++ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); ++ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); ++ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); ++ ++ auto out_dims = transposeconv_output_dimensions( ++ input->info()->dimension(idx_w), input->info()->dimension(idx_h), ++ 
weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, ++ invalid_bottom); ++ ++ const TensorShape output_shape = ++ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); ++ ++ // Output auto initialization if not yet initialized ++ auto_init_if_empty( ++ *output->info(), ++ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); ++ ++ // Perform validation step ++ ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( ++ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), ++ info, invalid_right, invalid_bottom)); ++ ++ _is_prepared = weights_info.retain_internal_weights(); ++ ++ _memory_group.manage(&_scaled_output); ++ ++ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order ++ // to match output shape ++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( ++ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, ++ pad_right, pad_top, pad_bottom); ++ ++ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), ++ input->info()->quantization_info()); ++ scale_out_info.set_data_layout(data_layout); ++ _scaled_output.allocator()->init(scale_out_info); ++ ++ // configure scale function ++ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, ++ DimensionRoundingType::FLOOR); ++ _scale_f.configure(input, &_scaled_output, upsample_info); ++ ++ // Setup the function to convolve the upscaled output ++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ++ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, ++ weights_info); ++ _scaled_output.allocator()->allocate(); ++ ++ // Setup flip axis data ++ _flip_axis.allocator()->allocate(); ++ _flip_axis.map(true); ++ auto axis_data = reinterpret_cast(_flip_axis.buffer()); ++ if (weights->info()->data_layout() == DataLayout::NHWC) ++ { ++ axis_data[0] = 1; ++ axis_data[1] = 2; ++ } ++ else ++ { ++ axis_data[0] = 0; ++ axis_data[1] = 1; ++ } ++ _flip_axis.unmap(); ++} ++ ++void CLDirectTransposeConvLayer::run() ++{ ++ prepare(); ++ ++ MemoryGroupResourceScope scope_mg(_memory_group); ++ ++ _scale_f.run(); ++ _conv_f.run(); ++} ++ ++void CLDirectTransposeConvLayer::prepare() ++{ ++ if (!_is_prepared) ++ { ++ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); ++ ++ // Run weights flipping and mark original weights tensor as unused ++ _weights_flipped.allocator()->allocate(); ++ _flip_weights.run(); ++ _original_weights->mark_as_unused(); ++ ++ // Prepare convolution ++ _conv_f.prepare(); ++ ++ // Free flipped weights ++ if (!_weights_flipped.is_used()) ++ { ++ _weights_flipped.allocator()->free(); ++ } ++ ++ _is_prepared = true; ++ } ++} ++} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +index 3d9a28a..ae9d8af 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +@@ -47,7 +47,7 @@ using namespace arm_compute; + void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, lookups); + 
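// The new CLDirectTransposeConvLayer above realizes a transposed convolution as
// upsampling (_scale_f), weight flipping (_flip_weights along _flip_axis) and a
// stride-1 convolution (_conv_f). The two 1-D float helpers below sketch why that
// decomposition works: the scatter form and the "zero-insert, pad, correlate with
// the flipped kernel" form produce the same output. Padding and the
// invalid_right / invalid_bottom trimming are left out; x and w are assumed non-empty.
#include <cstddef>
#include <vector>

static std::vector<float> tconv_scatter(const std::vector<float> &x,
                                        const std::vector<float> &w, std::size_t stride)
{
  std::vector<float> y((x.size() - 1) * stride + w.size(), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i)
    for (std::size_t j = 0; j < w.size(); ++j)
      y[i * stride + j] += x[i] * w[j]; // each input sample scatters a scaled kernel copy
  return y;
}

static std::vector<float> tconv_upsample_then_conv(const std::vector<float> &x,
                                                   const std::vector<float> &w, std::size_t stride)
{
  const std::size_t m = w.size();
  std::vector<float> z((x.size() - 1) * stride + 1 + 2 * (m - 1), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i)
    z[(m - 1) + i * stride] = x[i]; // zero-insertion upsample, padded by m-1 on both sides
  std::vector<float> y(z.size() - m + 1, 0.0f);
  for (std::size_t j = 0; j < y.size(); ++j)
    for (std::size_t t = 0; t < m; ++t)
      y[j] += z[j + t] * w[m - 1 - t]; // stride-1 correlation with the flipped kernel
  return y;
}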
_kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +index f098832..0198946 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +@@ -45,7 +45,7 @@ + #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/runtime/CL/CLScheduler.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + ARM_COMPUTE_UNUSED(weights); + ARM_COMPUTE_UNUSED(output); + ARM_COMPUTE_RETURN_ON_ERROR( +- CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); ++ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + + return Status{}; + } +@@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + + void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + } +@@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen + + // Quantize input + _quantized_input.allocator()->init( +- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + _memory_group.manage(&_quantized_input); + _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); + +@@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + +@@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); + + // Validate quantization symm8 kernel +- const ITensorInfo &quantized_input = TensorInfo( +- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ const ITensorInfo &quantized_input = ++ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +index 63e291b..2ff4b96 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +@@ -46,7 +46,7 @@ + #include 
"arm_compute/core/utils/misc/ShapeCalculator.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/runtime/CL/CLScheduler.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + + void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +index 9aebc47..157b4d9 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +@@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr(fc); + } +- else ++ else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) + { +- assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); +- + bool is_hybrid = (input->info()->data_type() == DataType::F32 || + input->info()->data_type() == DataType::F16) && +- weights->info()->data_type() == DataType::S8; ++ (weights->info()->data_type() == DataType::S8 || ++ weights->info()->data_type() == DataType::QASYMM8_SIGNED); + + if (is_hybrid) + { + auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager}; ++ ITensorInfo *weights_info = const_cast(_weights->info()); ++ const auto orgin_weights_data_type = weights_info->data_type(); ++ weights_info->set_data_type(DataType::QASYMM8_SIGNED); + fc->configure(input_to_use, _weights, _biases, _output); ++ weights_info->set_data_type(orgin_weights_data_type); + return std::unique_ptr(fc); + } + else +@@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp + return std::unique_ptr(fc); + } + } ++ else ++ { ++ throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type"); ++ } ++ + }(); + + if (_needs_reshape) +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp +deleted file mode 100644 +index ca5499d..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp ++++ /dev/null +@@ -1,180 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" +- +-#include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +-#include "arm_compute/runtime/MemoryGroup.h" +- +-namespace arm_compute +-{ +-using namespace arm_compute::misc::shape_calculator; +-using namespace arm_compute::cl_gemm; +- +-namespace +-{ +-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target) +-{ +- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run); +-} +-} // namespace +- +-CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx( +- std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(), +- _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0), +- _reshape_b_only_on_first_run(false), _is_prepared(false) +-{ +-} +- +-void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b, +- const ICLTensor *c, ICLTensor *output, +- const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); +- ARM_COMPUTE_UNUSED(c); +- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate( +- a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), gemm_info)); +- +- _is_prepared = false; +- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); +- _a_offset = a->info()->quantization_info().uniform().offset; +- _b_offset = b->info()->quantization_info().uniform().offset; +- +- // Get the GPU target +- const GPUTarget gpu_target = CLScheduler::get().target(); +- +- // Set the target for the kernels +- _mm_midgard_kernel.set_target(gpu_target); +- +- // GEMMRHSMatrixInfo rhs_info; +- // GEMMLHSMatrixInfo lhs_info; +- +- // Arguments used by GEMMReshapeInfo +- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, +- // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo +- // in order to know how the matrices have been reshaped +- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- const unsigned int m = reinterpret_input_as_3d +- ? (a->info()->dimension(1) * a->info()->dimension(2)) +- : a->info()->dimension(1); +- const unsigned int n = b->info()->dimension(0); +- const unsigned int k = a->info()->dimension(0); +- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); +- +- const ICLTensor *matrix_b = b; +- // Configure matrix multiply kernel +- _mm_midgard_kernel.configure( +- a, matrix_b, output, +- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); +-} +- +-Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, +- const ITensorInfo *c, const ITensorInfo *output, +- const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); +- ARM_COMPUTE_UNUSED(c); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), +- "Matrix A already reshaped is not supported"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), +- "Matrix B already reshaped is not supported"); +- +- const ITensorInfo *matrix_a_info = a; +- +- // Get the GPU target +- const GPUTarget gpu_target = CLScheduler::get().target(); +- +- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- const unsigned int m = +- reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); +- const unsigned int n = b->dimension(0); +- const unsigned int k = a->dimension(0); +- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); +- +- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target); +- +- const GEMMReshapeInfo reshape_info = +- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); +- +- TensorInfo weights_info(*b); +- const ITensorInfo *matrix_b_info = &weights_info; +- if (reshape_matrix_b) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(false, +- "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b"); +- } +- +- // Validate matrix multiply +- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate( +- matrix_a_info, matrix_b_info, output, reshape_info)); +- +- return Status{}; +-} +- +-void CLGEMMLowpMatrixMultiplyCoreEx::run() +-{ +- prepare(); +- +- MemoryGroupResourceScope scope_mg(_memory_group); +- +- // Run matrix multiply +- CLScheduler::get().enqueue(_mm_midgard_kernel, false); +-} +- +-void CLGEMMLowpMatrixMultiplyCoreEx::prepare() +-{ +- if (!_is_prepared) +- { +- _is_prepared = true; +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +index f594d7a..e0b833b 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +@@ -48,7 +48,7 @@ using namespace arm_compute; + void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +index 27ed8e8..65b89a3 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +@@ -47,7 +47,7 @@ using namespace arm_compute; + void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +index 80393e8..5a7e408 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp +deleted file mode 100644 +index fbb15ab..0000000 +--- 
a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp ++++ /dev/null +@@ -1,63 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLPReLU.h" +- +-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, alpha, output); +- _kernel = std::move(k); +- +- if (output->info()->dimension(0) > 1) +- { +- ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; +- +- if (broadcasted_info->info()->dimension(0) == 1) +- { +- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); +- } +- } +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp +deleted file mode 100644 +index 6049b7e..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp ++++ /dev/null +@@ -1,163 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
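// The hybrid fully-connected path touched above quantizes the F32 input symmetrically
// to 8 bit (now tagged QASYMM8_SIGNED rather than S8) with a scale factor before the
// low-precision matrix multiply. A sketch of one common symmetric scheme,
// scale = max|x| / 127 and q = round(x / scale) clamped to [-127, 127]; the exact
// rounding and clamping used by CLScaleFactorSymm8Kernel / CLQuantizationSymmetricKernel
// may differ.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<int8_t> quantize_symmetric(const std::vector<float> &row, float &scale_out)
{
  float max_abs = 0.0f;
  for (float v : row)
    max_abs = std::max(max_abs, std::fabs(v));
  scale_out = max_abs > 0.0f ? max_abs / 127.0f : 1.0f;

  std::vector<int8_t> q(row.size());
  for (std::size_t i = 0; i < row.size(); ++i)
  {
    const float r = std::round(row[i] / scale_out);
    q[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, r)));
  }
  return q;
}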
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +-#include "support/ToolchainSupport.h" +- +-#include +- +-using namespace arm_compute; +-using namespace arm_compute::misc::shape_calculator; +- +-CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), +- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), +- _gemm_output(), _add_output(), _is_prepared(false) +-{ +-} +- +-Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info) +-{ +- const int idx_width = 0; +- const int idx_height = 1; +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, +- output); +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != +- recurrent_weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != +- recurrent_weights->dimension(1)); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), +- hidden_state->tensor_shape()); +- +- auto shape_info = +- TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, +- input->data_type()); +- +- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); +- ARM_COMPUTE_RETURN_ON_ERROR( +- 
CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); +- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate( +- ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); +- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); +- +- return Status{}; +-} +- +-void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, +- const ICLTensor *recurrent_weights, const ICLTensor *bias, +- ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); +- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(), +- recurrent_weights->info(), bias->info(), +- hidden_state->info(), output->info(), info)); +- +- const int idx_height = 1; +- TensorShape shape = +- compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); +- +- _is_prepared = false; +- +- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- +- // Manage intermediate buffers and configure +- _memory_group.manage(&_fully_connected_out); +- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); +- +- _memory_group.manage(&_gemm_output); +- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); +- +- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _memory_group.manage(&_add_output); +- +- _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, +- &_add_output, ConvertPolicy::SATURATE); +- +- _fully_connected_out.allocator()->allocate(); +- _gemm_output.allocator()->allocate(); +- +- _activation_kernel.configure(&_add_output, hidden_state, info); +- _add_output.allocator()->allocate(); +- +- _copy_kernel.configure(hidden_state, output); +-} +- +-void CLRNNLayerEx::run() +-{ +- prepare(); +- +- _memory_group.acquire(); +- +- _fully_connected_kernel.run(); +- _gemm_state_f.run(); +- CLScheduler::get().enqueue(_add_kernel); +- CLScheduler::get().enqueue(_activation_kernel); +- +- // copy hidden out to output +- CLScheduler::get().enqueue(_copy_kernel); +- +- _memory_group.release(); +-} +- +-void CLRNNLayerEx::prepare() +-{ +- if (!_is_prepared) +- { +- _fully_connected_kernel.prepare(); +- _gemm_state_f.prepare(); +- +- _is_prepared = true; +- } +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +index 8ce2d74..a41e6db 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +@@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 
1 : 0); + + // Create temporary tensor infos +- auto interm_tensors = +- arm_compute::support::cpp14::make_unique(num_of_interm_tensors); ++ auto interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; +@@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); + +- _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); +- _reduce_kernels = +- arm_compute::support::cpp14::make_unique(num_of_kernels); ++ _interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); ++ _reduce_kernels = support::cpp14::make_unique(num_of_kernels); + + // Set a vector that is ordered ICLTensors sequentially. + std::vector tensors; +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp +deleted file mode 100644 +index 7d7b226..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp ++++ /dev/null +@@ -1,52 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
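CLReduceOperation above applies the same shortening to its array allocations. A small sketch of the validate() portion with a plausible element type spelled out; the hunk itself does not show the template argument, so TensorInfo[] is an assumption based on how the intermediate infos are used:

const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
// One TensorInfo per intermediate result of the reduction chain; the array form of
// make_unique value-initialises the elements.
auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);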
+- */ +- +-#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" +- +-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" +- +-using namespace arm_compute; +- +-void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_size); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +index e61746e..3215d01 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2018 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,218 +37,124 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" +-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +-#include "arm_compute/core/Helpers.h" + #include "arm_compute/core/Utils.h" +-#include "arm_compute/core/UtilsEx.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" ++#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/runtime/CL/CLScheduler.h" +-#include "arm_compute/runtime/CPP/CPPScheduler.h" + ++#include + #include + #include + + using namespace arm_compute; + using namespace arm_compute::misc::shape_calculator; + +-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr memory_manager) // NOLINT +- : _memory_group(std::move(memory_manager)), +- _scale_f(), +- _conv_f(), +- _flip_weights(), +- _scaled_output(), +- _original_weights(nullptr), +- _weights_flipped(), +- _is_prepared(false) ++CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr memory_manager) ++ : _memory_manager(std::move(memory_manager)), _function() ++{ ++} ++ ++void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ++ ICLTensor *output, const PadStrideInfo &deconv_info, ++ unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) + { ++ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, ++ invalid_right, invalid_bottom, weights_info); ++} ++ ++void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ++ ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ++ ++ switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, ++ output->info(), deconv_info, invalid_right, ++ invalid_bottom, weights_info)) ++ { ++ case DeconvolutionMethod::DIRECT: ++ { ++ auto f = arm_compute::support::cpp14::make_unique(); ++ f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, ++ invalid_bottom, weights_info); ++ _function = std::move(f); ++ break; ++ } ++ case DeconvolutionMethod::GEMM: ++ { ++ auto f = arm_compute::support::cpp14::make_unique(_memory_manager); ++ f->configure(compile_context, input, weights, bias, output, deconv_info); ++ _function = std::move(f); ++ 
break; ++ } ++ default: ++ ARM_COMPUTE_ERROR("Not supported."); ++ break; ++ } + } + + Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, +- const PadStrideInfo &info, unsigned int invalid_right, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); +- +- const DataLayout data_layout = input->data_layout(); +- +- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); +- +- const unsigned int kernel_x = weights->dimension(idx_w); +- const unsigned int kernel_y = weights->dimension(idx_h); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, +- "invalid_right must be smaller than kernel_x"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, +- "inner_border_top must be smaller than kernel_y"); +- +- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. +- auto out_dims = transposeconv_output_dimensions( +- input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), +- weights->dimension(idx_h), info, invalid_right, invalid_bottom); +- +- const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); +- +- if (bias != nullptr) ++ switch (CLTransposeConvLayer::get_deconvolution_method( ++ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + { +- if (is_data_type_quantized_asymmetric(input->data_type())) ++ case DeconvolutionMethod::DIRECT: + { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ++ // Validate direct convolution layer ++ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( ++ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); ++ break; + } +- else ++ case DeconvolutionMethod::GEMM: + { +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ // Validate gemm-based convolution layer ++ ARM_COMPUTE_RETURN_ON_ERROR( ++ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); ++ break; + } +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); ++ default: ++ ARM_COMPUTE_ERROR("Not supported."); ++ break; + } + +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], +- "Output's width is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], +- "Output's height is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], +- "Output's depth is invalid."); +- +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape 
= compute_transposeconv_upsampled_shape( +- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, +- pad_bottom); +- TensorInfo scale_out_info(input->clone() +- ->set_is_resizable(true) +- .reset_padding() +- .set_tensor_shape(scale_out_shape) +- .set_data_layout(data_layout)); +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- +- ARM_COMPUTE_RETURN_ON_ERROR( +- CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); +- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, +- conv_info, weights_info)); +- + return Status{}; + } + +-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, +- ICLTensor *output, const PadStrideInfo &info, +- unsigned int invalid_right, unsigned int invalid_bottom, +- const WeightsInfo &weights_info) ++DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( ++ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ++ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info) + { +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); +- +- const unsigned int stride_x = info.stride().first; +- const unsigned int stride_y = info.stride().second; ++ ARM_COMPUTE_UNUSED(output, bias, weights_info); + +- const DataLayout data_layout = input->info()->data_layout(); ++ const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + +- _original_weights = weights; +- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); +- _flip_weights.configure(weights, &_weights_flipped); +- +- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were +- // added. +- auto out_dims = transposeconv_output_dimensions( +- input->info()->dimension(idx_w), input->info()->dimension(idx_h), +- weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, +- invalid_bottom); +- +- const TensorShape output_shape = +- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); +- +- // Output auto initialization if not yet initialized +- auto_init_if_empty( +- *output->info(), +- input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); +- +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( +- input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), +- info, invalid_right, invalid_bottom)); +- +- _is_prepared = weights_info.retain_internal_weights(); +- +- _memory_group.manage(&_scaled_output); +- +- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order +- // to match output shape +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( +- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, +- pad_right, pad_top, pad_bottom); +- +- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), +- input->info()->quantization_info()); +- scale_out_info.set_data_layout(data_layout); +- _scaled_output.allocator()->init(scale_out_info); +- +- // configure scale function +- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, +- DimensionRoundingType::FLOOR); +- _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); +- +- // setup the function to convolve the upscaled output +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); +- _scaled_output.allocator()->allocate(); ++ if (weights->dimension(idx_w) != deconv_info.stride().first || ++ weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 || ++ invalid_bottom != 0) ++ { ++ return DeconvolutionMethod::DIRECT; ++ } ++ ++ return DeconvolutionMethod::GEMM; + } + + void CLTransposeConvLayer::run() + { + prepare(); +- +- _memory_group.acquire(); +- +- _scale_f.run(); +- _conv_f.run(); +- +- _memory_group.release(); ++ _function->run(); + } + +-void CLTransposeConvLayer::prepare() +-{ +- if (!_is_prepared) +- { +- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); +- +- // Run weights flipping and mark original weights tensor as unused +- _weights_flipped.allocator()->allocate(); +- _weights_flipped.map(true); +- _original_weights->map(CLScheduler::get().queue(), true); +- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); +- _weights_flipped.unmap(); +- _original_weights->unmap(CLScheduler::get().queue()); +- _original_weights->mark_as_unused(); +- +- // Prepare convolution +- _conv_f.prepare(); +- +- if (!_weights_flipped.is_used()) +- { +- _weights_flipped.allocator()->free(); +- } +- +- _is_prepared = true; +- } +-} ++void CLTransposeConvLayer::prepare() { _function->prepare(); } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp +deleted file mode 100644 +index 07feb5a..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp ++++ /dev/null +@@ -1,92 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
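The rewritten CLTransposeConvLayer above no longer owns upsample and convolution stages of its own; configure() selects a backing function once and run()/prepare() simply forward to it. A condensed sketch of that dispatch, with the function types taken from the validate() hunk above and the template arguments of make_unique restored accordingly:

void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
                                     ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
                                     const PadStrideInfo &deconv_info, unsigned int invalid_right,
                                     unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

  switch (get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
                                   deconv_info, invalid_right, invalid_bottom, weights_info))
  {
    case DeconvolutionMethod::DIRECT:
    {
      // Chosen when the kernel extent differs from the stride or invalid_right/invalid_bottom
      // is non-zero; only the direct implementation reproduces that output shape.
      auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
      f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
                   invalid_bottom, weights_info);
      _function = std::move(f);
      break;
    }
    case DeconvolutionMethod::GEMM:
    {
      auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
      f->configure(compile_context, input, weights, bias, output, deconv_info);
      _function = std::move(f);
      break;
    }
    default:
      ARM_COMPUTE_ERROR("Not supported.");
      break;
  }
}

void CLTransposeConvLayer::run()
{
  prepare();
  _function->run();
}

void CLTransposeConvLayer::prepare() { _function->prepare(); }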
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" +- +-#include "arm_compute/core/CL/OpenCL.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-#include +-#include +-#include +- +-using namespace arm_compute; +- +-CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT +- : _upsample(), +- _output(nullptr) +-{ +-} +- +-Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); +-} +- +-void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- _output = output; +- _upsample.configure(input, _output, inner_border, info); +-} +- +-void CLTransposeConvLayerUpsample::run() +-{ +- _output->map(CLScheduler::get().queue(), true); +- if (is_data_type_quantized_asymmetric(_output->info()->data_type())) +- { +- const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset; +- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); +- } +- else +- { +- memset(_output->buffer(), 0, _output->info()->total_size()); +- } +- _output->unmap(CLScheduler::get().queue()); +- +- CLScheduler::get().enqueue(_upsample, false); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +index 114e1a7..768c15b 100644 +--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +@@ -41,14 +41,14 @@ + #include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h" + + #include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + using namespace arm_compute; + + void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const 
ITensor *off_value, ITensor *output, const int axis) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(indices, depth, on_value, off_value, output, axis); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp +deleted file mode 100644 +index 6c90ef3..0000000 +--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp ++++ /dev/null +@@ -1,53 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" +- +-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +-#include "support/ToolchainSupport.h" +- +-using namespace arm_compute; +- +-void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, info); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp +index ff81ff8..2752eb6 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp +@@ -42,7 +42,7 @@ + + #include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h" + #include "arm_compute/runtime/IRuntimeContext.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + namespace arm_compute + { +@@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT + void NEActivationLayerEx::configure(ITensor *input, ITensor *output, + ActivationLayerInfo activation_info) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, activation_info); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +index e42c453..2fc94b2 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +@@ -42,7 +42,7 @@ + #include + + #include "arm_compute/core/ITensor.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -53,7 +53,7 @@ template + void NEBinaryLogicalOperationStatic::configure(ITensor *input1, ITensor *input2, + ITensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); + } +@@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic::validate(const ITensorInfo *input1, + void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp +deleted file mode 100644 +index dc5c620..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp ++++ /dev/null +@@ -1,60 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NECast.h" +- +-#include "arm_compute/core/NEON/kernels/NECastKernel.h" +-#include "support/ToolchainSupport.h" +- +-namespace arm_compute +-{ +-void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, input_subtype); +- _kernel = std::move(k); +-} +- +-Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype) +-{ +- return NECastKernel::validate(input, output, input_subtype); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp +deleted file mode 100644 +index 5ec0b86..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp ++++ /dev/null +@@ -1,63 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +- +-namespace arm_compute +-{ +-void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_shape); +- _kernel = std::move(k); +-} +- +-Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +index 53fb150..e0ab3e0 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +@@ -41,13 +41,13 @@ + #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + + #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + using namespace arm_compute; + + void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, lookups); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +index f457732..a123439 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +@@ -58,7 +58,7 @@ namespace + Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) + { + ARM_COMPUTE_RETURN_ON_ERROR( +- NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); ++ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, 
&output)); + + return Status{}; + } +@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + + void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + } +@@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor + + // Quantize input + _quantized_input.allocator()->init( +- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); +@@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); +@@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate quantization kernel +- const ITensorInfo &quantized_input = TensorInfo( +- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ const ITensorInfo &quantized_input = ++ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +index fcac3c7..dc6c784 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +@@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input + assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); + + bool is_hybrid = input->info()->data_type() == DataType::F32 && +- weights->info()->data_type() == DataType::S8; ++ (weights->info()->data_type() == DataType::S8 || ++ weights->info()->data_type() == DataType::QASYMM8_SIGNED); + + if (is_hybrid) + { + auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; ++ ITensorInfo *weights_info = const_cast(_weights->info()); ++ const auto orgin_weights_data_type = weights_info->data_type(); ++ weights_info->set_data_type(DataType::QASYMM8_SIGNED); + fc->configure(input_to_use, _weights, _biases, _output); ++ 
weights_info->set_data_type(orgin_weights_data_type); + return std::unique_ptr(fc); + } + else +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp +deleted file mode 100644 +index 1290cfd..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp ++++ /dev/null +@@ -1,513 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
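In the NEFullyConnectedReshapingLayer hunk above, the hybrid path now also accepts QASYMM8_SIGNED weights, and S8 weights are retagged as QASYMM8_SIGNED only for the duration of NEFullyConnectedHybridLayer::configure(), matching the stricter data-type check the hybrid layer now performs. A sketch of that block with the template arguments spelled out; the const_cast and unique_ptr target types are inferred, not shown in the hunk:

bool is_hybrid = input->info()->data_type() == DataType::F32 &&
                 (weights->info()->data_type() == DataType::S8 ||
                  weights->info()->data_type() == DataType::QASYMM8_SIGNED);

if (is_hybrid)
{
  auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
  // The hybrid layer validates its weights as QASYMM8_SIGNED, so S8 weights are temporarily
  // relabelled around configure() and restored right afterwards.
  ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
  const auto origin_weights_data_type = weights_info->data_type();
  weights_info->set_data_type(DataType::QASYMM8_SIGNED);
  fc->configure(input_to_use, _weights, _biases, _output);
  weights_info->set_data_type(origin_weights_data_type);
  return std::unique_ptr<arm_compute::IFunction>(fc);
}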
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +-#include "arm_compute/runtime/TensorAllocator.h" +-#include "support/ToolchainSupport.h" +- +-using namespace arm_compute; +-using namespace arm_compute::misc::shape_calculator; +- +-NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx( +- std::shared_ptr memory_manager) +- : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), +- _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), +- _mtx_b_reduction_kernel(), _offset_contribution_kernel(), +- _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), +- _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), +- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), +- _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), +- _fuse_output_stage(false), _flip_signedness(false) +-{ +-} +- +-void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c, +- ITensor *output, const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); +- ARM_COMPUTE_UNUSED(c); +- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate( +- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); +- +- const ITensor *matrix_a = a; +- const ITensor *matrix_b = b; +- GEMMInfo info = gemm_info; +- +- // Clear state +- _mtx_a_reshape_kernel = nullptr; +- _mtx_b_reshape_kernel = nullptr; +- +- // Set internal variables +- _a_offset = a->info()->quantization_info().uniform().offset; +- _b_offset = b->info()->quantization_info().uniform().offset; +- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; +- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); +- _is_prepared = false; +- _fused_assembly_path = false; +- _original_b = b; +- +- const ITensor *a_to_use = a; +- +- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage +- if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) +- { +- _fuse_output_stage = true; +- _memory_group.manage(&_mm_result_s32); +- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); +- _mm_result_s32.allocator()->init(info_mm_result_s32); +- } +- +-#ifdef __aarch64__ +- switch (a->info()->data_type()) +- { +- case DataType::QASYMM8: +- case DataType::QASYMM8_SIGNED: +- case DataType::U8: +- case DataType::S8: +- { +- if (a_to_use->info()->data_type() == DataType::QASYMM8 && +- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) +- { +- _asm_glue.configure(a_to_use, b, c, output, gemm_info); +- _fused_assembly_path = _asm_glue.is_configured(); +- } +- else +- { +- _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, +- gemm_info); +- } +- _assembly_path = _asm_glue.is_configured(); +- break; +- } +- default: +- { +- ARM_COMPUTE_ERROR("Datatype not supported"); +- break; +- } +- } +-#endif /* __aarch64__ */ +- if (!(_assembly_path || _run_vector_matrix_multiplication)) +- { +- matrix_a = &_tmp_a; +- matrix_b = &_tmp_b; +- +- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / +- // 4.0f) ] +- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, +- a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); +- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / +- // 16.0f) ] +- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), +- b->info()->quantization_info()); +- _tmp_a.allocator()->init(a_info); +- _tmp_b.allocator()->init(b_info); +- _memory_group.manage(&_tmp_a); +- if (!_reshape_b_only_on_first_run) +- { +- _memory_group.manage(&_tmp_b); +- } +- +- // Configure interleave kernel +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(a_to_use, &_tmp_a); +- _mtx_a_reshape_kernel = std::move(k); +- } +- +- // Configure transpose kernel +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(b, &_tmp_b); +- _mtx_b_reshape_kernel = std::move(k); +- } +- } +- +- if (!_fused_assembly_path) +- { +- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 +- if (_a_offset != 0) +- { +- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); +- +- _vector_sum_col.allocator()->init(info_vector_sum_col); +- if (!_reshape_b_only_on_first_run) +- { +- _memory_group.manage(&_vector_sum_col); +- } +- +- // Configure Matrix B reduction kernel +- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); +- } +- +- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 +- if (_b_offset != 0) +- { +- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); +- +- _vector_sum_row.allocator()->init(info_vector_sum_row); +- _memory_group.manage(&_vector_sum_row); +- +- // Configure matrix A reduction kernel +- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), +- false); +- } +- +- if (_fuse_output_stage) +- { +- // Configure matrix multiply kernel +- if (!_assembly_path) +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(matrix_a, matrix_b, &_mm_result_s32); +- _mm_kernel = std::move(k); +- } +- +- _offset_contribution_output_stage_kernel.configure( +- &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, +- _b_offset == 0 ? nullptr : &_vector_sum_row, c, +- _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset, +- _b_offset, info.gemmlowp_output_stage()); +- } +- else +- { +- // Configure matrix multiply kernel +- if (!_assembly_path) +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(matrix_a, matrix_b, output); +- _mm_kernel = std::move(k); +- } +- // Configure offset contribution kernel +- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, +- _b_offset == 0 ? 
nullptr : &_vector_sum_row, +- a_to_use->info()->dimension(0), _a_offset, _b_offset); +- } +- } +- +- // Allocate tensors +- if (!_assembly_path && !_run_vector_matrix_multiplication) +- { +- _tmp_a.allocator()->allocate(); +- if (!_reshape_b_only_on_first_run) +- { +- _tmp_b.allocator()->allocate(); +- } +- } +- +- if (!_fused_assembly_path) +- { +- if (_a_offset != 0 && !_reshape_b_only_on_first_run) +- { +- _vector_sum_col.allocator()->allocate(); +- } +- +- if (_b_offset != 0) +- { +- _vector_sum_row.allocator()->allocate(); +- } +- } +- +- if (_fuse_output_stage) +- { +- _mm_result_s32.allocator()->allocate(); +- } +-} +- +-Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, +- const ITensorInfo *c, const ITensorInfo *output, +- const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, +- "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), +- "The product AB is defined only if the number of columns in A is " +- "equal to the number of rows in B"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), +- "Matrix A already reshaped is not supported"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), +- "Matrix B already reshaped is not supported"); +- +- GEMMInfo info = gemm_info; +- const ITensorInfo *matrix_a_info = a; +- const ITensorInfo *matrix_b_info = b; +- +- const ITensorInfo *a_to_use = a; +- +- TensorInfo tmp_a_info{}; +- TensorInfo tmp_b_info{}; +- TensorInfo mm_result_s32_info{}; +- +- int32_t a_offset = a->quantization_info().uniform().offset; +- int32_t b_offset = b->quantization_info().uniform().offset; +- +- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; +- if (fuse_output_stage) +- { +- auto_init_if_empty( +- mm_result_s32_info, +- a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); +- } +- +- // Check if we need to run the optimized assembly kernel +- bool run_optimised = false; +- bool run_optimised_requantized = false; +- if (a_to_use->data_type() == DataType::QASYMM8 && +- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) +- { +- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); +- run_optimised_requantized = run_optimised; +- } +- else +- { +- run_optimised = bool(NEGEMMAssemblyDispatch::validate( +- a_to_use, b, c, fuse_output_stage ? 
&mm_result_s32_info : output, gemm_info)); +- } +- +- if (run_optimised) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); +- if (info.depth_output_gemm3d() != 0) +- { +- if (info.reinterpret_input_as_3d()) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); +- } +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); +- } +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), +- "NEGEMM cannot reinterpret the input tensor as 3D"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, +- "NEGEMM cannot reinterpret the output tensor as 3D"); +- +- const bool run_vector_matrix_multiplication = a->dimension(1) < 2; +- if (!run_vector_matrix_multiplication) +- { +- matrix_a_info = &tmp_a_info; +- matrix_b_info = &tmp_b_info; +- +- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / +- // 4.0f) ] +- TensorShape shape_tmp_a = a->tensor_shape(); +- shape_tmp_a.set(0, a->dimension(0) * 4); +- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); +- +- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width +- // / 16.0f) ] +- TensorShape shape_tmp_b = b->tensor_shape(); +- shape_tmp_b.set(0, b->dimension(1) * 16); +- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); +- +- // Validate interleave kernel +- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); +- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); +- +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); +- } +- } +- +- if (!run_optimised_requantized) +- { +- TensorInfo info_vector_sum_col{}; +- TensorInfo info_vector_sum_row{}; +- +- // Validate matrix B reduction kernel only if _a_offset is not equal to 0 +- if (a_offset != 0) +- { +- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); +- +- // Configure Matrix B reduction kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate( +- b, &info_vector_sum_col, a->dimension(0), false)); +- } +- +- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 +- if (b_offset != 0) +- { +- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); +- +- // Configure matrix A reduction kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate( +- a_to_use, &info_vector_sum_row, a->dimension(0), false)); +- } +- +- if (fuse_output_stage) +- { +- if (!run_optimised) +- { +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate( +- matrix_a_info, matrix_b_info, &mm_result_s32_info)); +- } +- +- // Validate offset contribution kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate( +- &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, +- b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, +- info.gemmlowp_output_stage())); +- } +- else +- { +- if (!run_optimised) +- { +- ARM_COMPUTE_RETURN_ON_ERROR( +- NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); +- } +- // Validate offset contribution kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate( +- output, a_offset == 0 ? nullptr : &info_vector_sum_col, +- b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset)); +- } +- } +- return Status{}; +-} +- +-void NEGEMMLowpMatrixMultiplyCoreEx::run() +-{ +- prepare(); +- +- MemoryGroupResourceScope scope_mg(_memory_group); +- +- // Reshape inputs +- if (_mtx_a_reshape_kernel) +- { +- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); +- } +- if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) +- { +- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); +- } +- +- // Run GEMM +- if (_asm_glue.is_configured()) +- { +- _asm_glue.run(); +- } +- else +- { +- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); +- } +- +- if (!_fused_assembly_path) +- { +- // Run matrix A reduction kernel only if _b_offset is not equal to 0 +- if (_b_offset != 0) +- { +- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); +- } +- +- // Run matrix B reduction kernel only if _a_offset is not equal to 0 +- if (_a_offset != 0 && !_reshape_b_only_on_first_run) +- { +- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); +- } +- +- if (_fuse_output_stage) +- { +- // Run offset contribution kernel +- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); +- } +- else +- { +- // Run offset contribution kernel +- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); +- } +- } +-} +- +-void NEGEMMLowpMatrixMultiplyCoreEx::prepare() +-{ +- if (!_is_prepared) +- { +- // Run assembly reshape +- if (_asm_glue.is_configured() && _reshape_b_only_on_first_run) +- { +- ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); +- +- _asm_glue.prepare(); +- _original_b->mark_as_unused(); +- } +- // Run non-assembly reshape +- else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) +- { +- ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); +- +- // Run reshape kernel and mark original weights tensor as unused +- _tmp_b.allocator()->allocate(); +- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); +- _original_b->mark_as_unused(); +- } +- +- // Run matrix B reduction kernel only if _a_offset is not equal to 0 +- if (_a_offset != 0 && _reshape_b_only_on_first_run) +- { +- _vector_sum_col.allocator()->allocate(); +- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); +- } +- +- _is_prepared = true; +- } +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +index c8bb88a..433c35d 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +@@ -41,7 +41,7 @@ + #include "arm_compute/runtime/NEON/functions/NEGatherEx.h" + + #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -49,7 +49,7 @@ namespace arm_compute + { + void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) + { +- auto k = 
arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +index 078019f..52d58ac 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +@@ -41,14 +41,14 @@ + #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" + + #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + using namespace arm_compute; + + void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, + ITensor *output, ITensor *hits) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp +deleted file mode 100644 +index dac3b84..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp ++++ /dev/null +@@ -1,55 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NEPReLU.h" +- +-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +-#include "support/ToolchainSupport.h" +- +-#include +- +-using namespace arm_compute; +- +-void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, alpha, output); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp +deleted file mode 100644 +index 0e9a5e9..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp ++++ /dev/null +@@ -1,161 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +- +-namespace arm_compute +-{ +-NERNNLayerEx::NERNNLayerEx(std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), +- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), +- _gemm_output(), _add_output(), _is_prepared(false) +-{ +-} +- +-Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, +- output); +- +- const int idx_width = 0; +- const int idx_height = 1; +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != +- recurrent_weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != +- recurrent_weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), +- hidden_state->tensor_shape()); +- +- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape( +- recurrent_weights, hidden_state->dimension(idx_height)), +- 1, input->data_type()); +- +- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); +- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate( +- &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); +- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); +- +- return Status{}; +-} +- +-void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights, +- const ITensor *recurrent_weights, const ITensor *bias, +- ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); +- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(), +- recurrent_weights->info(), bias->info(), +- hidden_state->info(), output->info(), info)); +- +- const int idx_height = 1; +- TensorShape shape = misc::shape_calculator::compute_rnn_shape( +- recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); +- +- _is_prepared = false; +- +- // Manage intermediate buffers and configure +- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- +- // Manage intermediate buffers and configure +- _memory_group.manage(&_fully_connected_out); +- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); +- +- 
_memory_group.manage(&_gemm_output); +- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); +- +- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _memory_group.manage(&_add_output); +- +- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, +- ConvertPolicy::SATURATE); +- +- _fully_connected_out.allocator()->allocate(); +- _gemm_output.allocator()->allocate(); +- +- _activation_kernel.configure(&_add_output, hidden_state, info); +- _add_output.allocator()->allocate(); +- +- _copy_kernel.configure(hidden_state, output); +-} +- +-void NERNNLayerEx::run() +-{ +- prepare(); +- +- MemoryGroupResourceScope scope_mg(_memory_group); +- +- _fully_connected_kernel.run(); +- +- _gemm_state_f.run(); +- +- NEScheduler::get().schedule(&_add_kernel, Window::DimY); +- NEScheduler::get().schedule(&_activation_kernel, Window::DimY); +- +- // copy hidden out to output +- NEScheduler::get().schedule(&_copy_kernel, Window::DimY); +-} +- +-void NERNNLayerEx::prepare() +-{ +- if (!_is_prepared) +- { +- _fully_connected_kernel.prepare(); +- _gemm_state_f.prepare(); +- +- _is_prepared = true; +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp +deleted file mode 100644 +index 116bba3..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp ++++ /dev/null +@@ -1,180 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +- +-using namespace arm_compute; +- +-NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), +- _reduction_ops(), _keep_dims() +-{ +-} +- +-Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, +- bool keep_dims, const ITensorInfo *output) +-{ +- ARM_COMPUTE_UNUSED(keep_dims); +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); +- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); +- +- TensorShape out_shape = input->tensor_shape(); +- const unsigned int reduction_ops = reduction_axis.num_dimensions(); +- const int input_dims = input->num_dimensions(); +- Coordinates axis_local = reduction_axis; +- +- // Convert negative axis +- for (unsigned int i = 0; i < reduction_ops; ++i) +- { +- axis_local[i] = wrap_around(axis_local[i], input_dims); +- } +- +- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +- for (unsigned int i = 0; i < reduction_ops; ++i) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); +- ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > +- input->num_dimensions() - 1); +- if (output->total_size() > 0 && keep_dims) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); +- } +- if (keep_dims) +- { +- out_shape.set(axis_local[i], 1); +- } +- else +- { +- out_shape.remove_dimension(axis_local[i] - i); +- } +- } +- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); +- +- return Status{}; +-} +- +-void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, +- ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input); +- +- _reduction_ops = reduction_axis.num_dimensions(); +- _reduction_kernels = +- arm_compute::support::cpp14::make_unique(_reduction_ops); +- _reduced_outs = +- arm_compute::support::cpp14::make_unique(_reduction_ops - (keep_dims ? 1 : 0)); +- _keep_dims = keep_dims; +- +- Coordinates axis_local = reduction_axis; +- const int input_dims = input->info()->num_dimensions(); +- const unsigned int reduction_ops = reduction_axis.num_dimensions(); +- +- // Convert negative axis +- for (unsigned int i = 0; i < reduction_ops; ++i) +- { +- axis_local[i] = wrap_around(axis_local[i], input_dims); +- } +- +- // Perform reduction for every axis +- for (unsigned int i = 0; i < _reduction_ops; ++i) +- { +- TensorShape out_shape = i == 0 ? input->info()->tensor_shape() +- : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); +- out_shape.set(axis_local[i], 1); +- auto in = (i == 0) ? 
input : (_reduced_outs.get() + i - 1); +- +- if (i == _reduction_ops - 1 && keep_dims) +- { +- _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); +- } +- else +- { +- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), +- input->info()->data_type(), +- input->info()->quantization_info()) +- .set_data_layout(output->info()->data_layout())); +- _memory_group.manage(_reduced_outs.get() + i); +- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], +- ReductionOperation::MEAN_SUM); +- } +- } +- +- // Allocate intermediate tensors +- for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) +- { +- _reduced_outs[i].allocator()->allocate(); +- } +- +- // Configure reshape layer if we want to drop the dimensions +- if (!keep_dims) +- { +- TensorShape out_shape = input->info()->tensor_shape(); +- +- // We have to sort the reduction axis vectors in order for remove_dimension +- // to work properly +- std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); +- for (unsigned int i = 0; i < _reduction_ops; ++i) +- { +- out_shape.remove_dimension(axis_local[i] - i); +- } +- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); +- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); +- } +-} +- +-void NEReduceMeanEx::run() +-{ +- _memory_group.acquire(); +- +- for (unsigned int i = 0; i < _reduction_ops; ++i) +- { +- _reduction_kernels[i].run(); +- } +- +- if (!_keep_dims) +- { +- _reshape.run(); +- } +- _memory_group.release(); +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp +deleted file mode 100644 +index 198bb76..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp ++++ /dev/null +@@ -1,114 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +- +-namespace arm_compute +-{ +-NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() +- : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +-{ +-} +- +-void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, +- const ITensor *paddings, ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); +- +- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) +- { +- _has_padding = true; +- _memset_kernel.configure( +- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); +- } +- _space_to_batch_kernel.configure(input, block_shape, paddings, output); +-} +- +-void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, +- const int block_shape_y, const Size2D &padding_left, +- const Size2D &padding_right, ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) +- { +- _has_padding = true; +- _memset_kernel.configure( +- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); +- } +- _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, +- output); +-} +- +-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, +- const ITensorInfo *paddings, const ITensorInfo *output) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR( +- NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); +- +- return Status{}; +-} +- +-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, +- const int block_shape_y, const Size2D &padding_left, +- const Size2D &padding_right, const ITensorInfo *output) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( +- input, block_shape_x, block_shape_y, padding_left, padding_right, output)); +- +- return Status{}; +-} +- +-void NESpaceToBatchLayerEx::run() +-{ +- // Zero out output only if we have paddings +- if (_has_padding) +- { +- NEScheduler::get().schedule(&_memset_kernel, Window::DimY); +- } +- NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp +deleted file mode 100644 +index 97697e3..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp ++++ /dev/null +@@ -1,64 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +- +-namespace arm_compute +-{ +-void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_shape); +- _kernel = std::move(k); +-} +- +-Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); +- return Status{}; +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +index df06892..09f1780 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +@@ -1,21 +1,5 @@ + /* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
+- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,14 +21,11 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + + #include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/Utils.h" + #include "arm_compute/core/UtilsEx.h" + #include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + #include "arm_compute/runtime/NEON/NEScheduler.h" + +@@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator; + + namespace arm_compute + { ++ + NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), +- _permute_input(), +- _permute_weights(), +- _permute_output(), + _scaled_output(), + _weights_flipped(), +- _permuted_input(), +- _permuted_weights(), +- _permuted_output(), +- _is_nchw(false), ++ _flip_axis(), + _original_weights(nullptr), + _input(nullptr), + _info(), +@@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, +- DataType::QASYMM8); ++ DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = +@@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); +- if (is_data_type_quantized_asymmetric(input->data_type()) && bias) ++ if (bias != nullptr) + { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); +- } +- else if (bias) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ if (is_data_type_quantized_asymmetric(input->data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ++ } ++ else ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ } + } + + if (output->tensor_shape().total_size() > 0) +@@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), +- "Output's dim 0 is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), +- "Output's dim 1 is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), +- "Output's dim 2 is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), ++ "Output's width is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), ++ "Output's height is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), ++ "Output's depth is invalid."); + } + + unsigned int pad_left = 0; +@@ -127,7 +106,6 @@ Status 
NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); +- scale_out_info.set_data_layout(input->data_layout()); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = +@@ -149,19 +127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) + { ++ // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ++ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( ++ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), ++ info, invalid_right, invalid_bottom)); + + const DataLayout data_layout = input->info()->data_layout(); +- +- _input = input; +- _original_weights = weights; +- _info = info; +- _is_prepared = false; +- _is_nchw = data_layout == DataLayout::NCHW; +- +- const unsigned int stride_x = info.stride().first; +- const unsigned int stride_y = info.stride().second; +- + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = +@@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); ++ ++ _input = input; ++ _original_weights = weights; ++ _info = info; ++ _is_prepared = false; ++ ++ unsigned int pad_left = 0; ++ unsigned int pad_right = 0; ++ unsigned int pad_top = 0; ++ unsigned int pad_bottom = 0; ++ const unsigned int stride_x = info.stride().first; ++ const unsigned int stride_y = info.stride().second; ++ + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( +- input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), +- info, invalid_right, invalid_bottom)); +- ++ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _memory_group.manage(&_scaled_output); + +- if (!_is_nchw) +- { +- _memory_group.manage(&_permuted_input); +- _memory_group.manage(&_permuted_weights); +- _memory_group.manage(&_permuted_output); +- +- // Configure the function to transform the input tensor from NHWC -> NCHW +- _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); +- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); +- _permuted_input.info()->set_data_layout(DataLayout::NCHW); +- +- // Configure the function to transform the weights tensor from NHWC -> NCHW +- _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); +- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); +- _permuted_weights.info()->set_data_layout(DataLayout::NCHW); +- +- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in +- // order to match output shape +- +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( +- *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, +- invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); +- +- TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), +- _permuted_input.info()->quantization_info()); +- scale_out_info.set_data_layout(DataLayout::NCHW); +- _scaled_output.allocator()->init(scale_out_info); +- +- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, +- DimensionRoundingType::CEIL); +- _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); +- +- _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); +- _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); +- _flip_weights.configure(&_permuted_weights, &_weights_flipped); +- +- // setup the function to convolve the upscaled output +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- +- const auto out_shape = output->info()->tensor_shape(); +- TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; +- TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), +- output->info()->quantization_info()); +- _permuted_output.allocator()->init(permuted_out_info); +- _permuted_output.info()->set_data_layout(DataLayout::NCHW); +- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); +- +- // Configure the function to transform the convoluted output to NHWC +- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); +- +- _permuted_input.allocator()->allocate(); +- _permuted_weights.allocator()->allocate(); +- _permuted_output.allocator()->allocate(); +- } +- else +- { +- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in +- // order to match output shape +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( +- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, +- 
pad_right, pad_top, pad_bottom); +- +- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), +- input->info()->quantization_info()); +- _scaled_output.allocator()->init(scale_out_info); +- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, +- DimensionRoundingType::FLOOR); +- _upsample_f.configure(input, &_scaled_output, upsample_info); +- +- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); +- _flip_weights.configure(weights, &_weights_flipped); +- +- // setup the function to convolve the upscaled output +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); +- } ++ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); ++ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); ++ ++ // setup the function to convolve the upscaled output ++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ++ ++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( ++ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, ++ pad_right, pad_top, pad_bottom); ++ ++ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, ++ DimensionRoundingType::FLOOR); ++ ++ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), ++ input->info()->quantization_info()); ++ scale_out_info.set_data_layout(data_layout); ++ _scaled_output.allocator()->init(scale_out_info); ++ ++ _upsample_f.configure(input, &_scaled_output, upsample_info); ++ ++ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); ++ ++ // Setup flip axis data ++ _flip_axis.allocator()->allocate(); ++ auto axis_data = reinterpret_cast(_flip_axis.buffer()); ++ axis_data[0] = static_cast(width_idx); ++ axis_data[1] = static_cast(height_idx); ++ + _scaled_output.allocator()->allocate(); + } + +@@ -275,22 +200,10 @@ void NETransposeConvLayer::run() + { + prepare(); + +- // MemoryGroupResourceScope scope_mg(_memory_group); +- +- // Permute input +- if (!_is_nchw) +- { +- _permute_input.run(); +- } ++ MemoryGroupResourceScope scope_mg(_memory_group); + + _upsample_f.run(); + _conv_f.run(); +- +- // Permute output +- if (!_is_nchw) +- { +- _permute_output.run(); +- } + } + + void NETransposeConvLayer::prepare() +@@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare() + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); +- // Permute weights +- if (!_is_nchw) +- { +- _permute_weights.run(); +- } +- NEScheduler::get().schedule(&_flip_weights, Window::DimZ); ++ _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + +- if (!_weights_flipped.is_used()) +- { +- _weights_flipped.allocator()->free(); +- } +- + _is_prepared = true; + } + } +diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt +index 09f6725..609dd45 100644 +--- a/compute/cker/CMakeLists.txt ++++ b/compute/cker/CMakeLists.txt +@@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp) + target_link_libraries(nnfw_lib_cker INTERFACE ruy) + target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation) + target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV) ++if(EXPERIMENTAL_RUY_FEATURE) ++ 
target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE) ++endif(EXPERIMENTAL_RUY_FEATURE) + if(PROFILE_RUY) + target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler) + endif(PROFILE_RUY) +diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h +index 41b1916..1bde640 100644 +--- a/compute/cker/include/cker/Types.h ++++ b/compute/cker/include/cker/Types.h +@@ -259,6 +259,12 @@ struct FullyConnectedParams + // FullyConnectedWeightsFormat weights_format; + }; + ++struct L2NormParams ++{ ++ // uint8 inference params. ++ int32_t input_zero_point; ++}; ++ + struct GatherParams + { + int32_t axis; +@@ -338,6 +344,11 @@ struct SpaceToBatchParams + int32_t output_offset; + }; + ++struct SpaceToDepthParams ++{ ++ int32_t block_size; ++}; ++ + enum class Order + { + kColMajor, +diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h +index b69d55c..2abb998 100644 +--- a/compute/cker/include/cker/Utils.h ++++ b/compute/cker/include/cker/Utils.h +@@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input) + return leading_zeros; + } + ++inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, ++ int32_t *output_inv_sqrt, int *output_shift) ++{ ++ assert(input >= 0); ++ if (input <= 1) ++ { ++ // Handle the input value 1 separately to avoid overflow in that case ++ // in the general computation below (b/143972021). Also handle 0 as if it ++ // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid ++ // but rare/unrealistic input value. We can expect both to occur in some ++ // incompletely trained models, but probably not in fully trained models. ++ *output_inv_sqrt = std::numeric_limits<int32_t>::max(); ++ *output_shift = 0; ++ return; ++ } ++ assert(input > 1); ++ *output_shift = 11; ++ while (input >= (1 << 29)) ++ { ++ input /= 4; ++ ++*output_shift; ++ } ++ const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1; ++ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; ++ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; ++ *output_shift -= left_shift_bit_pairs; ++ input <<= 2 * left_shift_bit_pairs; ++ assert(input >= (1 << 27)); ++ assert(input < (1 << 29)); ++ using gemmlowp::FixedPoint; ++ using gemmlowp::Rescale; ++ using gemmlowp::SaturatingRoundingMultiplyByPOT; ++ // Using 3 integer bits gives us enough room for the internal arithmetic in ++ // this Newton-Raphson iteration. ++ using F3 = FixedPoint<int32_t, 3>; ++ using F0 = FixedPoint<int32_t, 0>; ++ const F3 fixedpoint_input = F3::FromRaw(input >> 1); ++ const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); ++ const F3 fixedpoint_half_three = ++ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); ++ // Newton-Raphson iteration ++ // Naive unoptimized starting guess: x = 1 ++ F3 x = F3::One(); ++ // Naive unoptimized number of iterations: 5 ++ for (int i = 0; i < 5; i++) ++ { ++ const F3 x3 = Rescale<3>(x * x * x); ++ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); ++ } ++ const F0 fixedpoint_half_sqrt_2 = ++ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); ++ x = x * fixedpoint_half_sqrt_2; ++ *output_inv_sqrt = x.raw(); ++ if (*output_shift < 0) ++ { ++ *output_inv_sqrt <<= -*output_shift; ++ *output_shift = 0; ++ } ++ // Convert right shift (right is positive) to left shift.
++ *output_shift *= reverse_shift; ++} ++ + // Comment from tensorflow lite: + // + // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h +index 9bcf3fd..9b72811 100644 +--- a/compute/cker/include/cker/operation/FullyConnected.h ++++ b/compute/cker/include/cker/operation/FullyConnected.h +@@ -78,8 +78,11 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu + MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size, + output_data, /*result_stride=*/1); + +- // Apply activation function +- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ if (params.activation != FusedActivationFunctionType::kNone) ++ { ++ // Apply activation function ++ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ } + } + + inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, +@@ -195,7 +198,11 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape + #endif + + // Apply activation function to floats. +- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ if (params.activation != FusedActivationFunctionType::kNone) ++ { ++ // Apply activation function ++ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ } + return; + } + +diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h +new file mode 100644 +index 0000000..a0075c3 +--- /dev/null ++++ b/compute/cker/include/cker/operation/L2Normalize.h +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef __NNFW_CKER_L2NORMALIZE_H__ ++#define __NNFW_CKER_L2NORMALIZE_H__ ++ ++#include "cker/Shape.h" ++#include "cker/Utils.h" ++#include "cker/Types.h" ++ ++namespace nnfw ++{ ++namespace cker ++{ ++ ++void L2NormalizeFloat32(const Shape &input_shape, const float *input_data, ++ const Shape &output_shape, float *output_data) ++{ ++ float epsilon = 1e-6; ++ const int trailing_dim = input_shape.DimensionsCount() - 1; ++ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); ++ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); ++ for (int i = 0; i < outer_size; ++i) ++ { ++ float squared_l2_norm = 0; ++ for (int c = 0; c < depth; ++c) ++ { ++ const float val = input_data[c]; ++ squared_l2_norm += val * val; ++ } ++ float l2_norm = std::sqrt(squared_l2_norm); ++ l2_norm = std::max(l2_norm, epsilon); ++ for (int c = 0; c < depth; ++c) ++ { ++ *output_data = *input_data / l2_norm; ++ ++output_data; ++ ++input_data; ++ } ++ } ++} ++ ++void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uint8_t *input_data, ++ const Shape &output_shape, uint8_t *output_data) ++{ ++ const int trailing_dim = input_shape.DimensionsCount() - 1; ++ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); ++ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); ++ const int32_t input_zero_point = params.input_zero_point; ++ ++ for (int i = 0; i < outer_size; ++i) ++ { ++ int32_t square_l2_norm = 0; ++ for (int c = 0; c < depth; c++) ++ { ++ // Note that input_data advances by depth in the second pass below. ++ int32_t diff = input_data[c] - input_zero_point; ++ square_l2_norm += diff * diff; ++ } ++ int32_t inv_l2norm_multiplier; ++ int inv_l2norm_shift; ++ GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift); ++ for (int c = 0; c < depth; c++) ++ { ++ int32_t diff = *input_data - input_zero_point; ++ int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( ++ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); ++ int32_t unclamped_output_val = 128 + rescaled_diff; ++ int32_t output_val = std::min(static_cast(255), ++ std::max(static_cast(0), unclamped_output_val)); ++ *output_data = static_cast(output_val); ++ ++input_data; ++ ++output_data; ++ } ++ } ++} ++ ++} // namespace cker ++} // namespace nnfw ++ ++#endif // __NNFW_CKER_L2NORMALIZE_H__ +diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h +index 7477858..3d3e59e 100644 +--- a/compute/cker/include/cker/operation/Logistic.h ++++ b/compute/cker/include/cker/operation/Logistic.h +@@ -32,18 +32,9 @@ namespace cker + inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) + { +-#ifdef __aarch64__ + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); + output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op()); +-#else +- // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2) +- const int size = MatchingFlatSize(input_shape, output_shape); +- for (int i = 0; i < size; i++) +- { +- output_data[i] = 1.f / (1.f + std::exp(-input_data[i])); +- } +-#endif + } + + } // namespace cker +diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h +index af432f3..4a2732d 100644 +--- 
a/compute/cker/include/cker/operation/Pad.h ++++ b/compute/cker/include/cker/operation/Pad.h +@@ -26,9 +26,10 @@ namespace nnfw + { + namespace cker + { ++template + inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, +- const float *input_data, const Shape &output_shape, float *output_data, +- const float *constant_value_data) ++ const T *input_data, const Shape &output_shape, T *output_data, ++ const T *constant_value_data) + { + // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC` + // TODO: come up with more subtle solution that uses subtensors like arm compute +@@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + /** List of padding information */ + using PaddingList = std::vector; + +- auto constant_value = constant_value_data ? *constant_value_data : 0; ++ const T constant_value = constant_value_data ? *constant_value_data : 0; + assert(output_shape.DimensionsCount() == input_shape.DimensionsCount()); + + PaddingList padding_list(pad_rank); +@@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + { + const int32_t in_row_len = input_shape.Dims(0); + std::fill_n(output_data, padding_list[0].first, constant_value); +- std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float)); ++ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T)); + std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second, + constant_value); + break; +@@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + out_offset += padding_list[1].first; + + // copy a row of input data +- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); ++ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_offset += in_row_len; + +@@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + out_offset += padding_list[2].first; + + // copy a row of input data +- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); ++ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_offset += in_row_len; + +@@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + out_c_offset += padding_list[3].first; + + // copy a row of input data +- memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float)); ++ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_c_offset += in_row_len; + +diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h +new file mode 100644 +index 0000000..5c82d11 +--- /dev/null ++++ b/compute/cker/include/cker/operation/Quantize.h +@@ -0,0 +1,47 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __NNFW_CKER_QUANTIZE_H__ ++#define __NNFW_CKER_QUANTIZE_H__ ++ ++#include "cker/Shape.h" ++#include "cker/Types.h" ++#include "cker/Utils.h" ++#include ++#include ++namespace nnfw ++{ ++namespace cker ++{ ++template <typename InputT, typename OutputT> ++inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape, ++ OutputT *output_data, const float output_scale, const int32_t output_offset) ++{ ++ const int flat_size = MatchingFlatSize(input_shape, output_shape); ++ int min_val = std::numeric_limits<OutputT>::min(); ++ int max_val = std::numeric_limits<OutputT>::max(); ++ ++ for (int i = 0; i < flat_size; i++) ++ { ++ int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset; ++ int32_t clamped = std::min(std::max(unclamped, min_val), max_val); ++ output_data[i] = clamped; ++ } ++} ++} // namespace cker ++} // namespace nnfw ++ ++#endif // __NNFW_CKER_QUANTIZE_H__ +diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h +new file mode 100644 +index 0000000..ef67931 +--- /dev/null ++++ b/compute/cker/include/cker/operation/SpaceToDepth.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__ ++#define __NNFW_CKER_SPACE_TO_DEPTH_H__ ++ ++#include "cker/Shape.h" ++#include "cker/Types.h" ++ ++namespace nnfw ++{ ++namespace cker ++{ ++ ++template <typename T> ++inline void SpaceToDepth(const SpaceToDepthParams &params, const Shape &unextended_input_shape, ++ const T *input_data, const Shape &unextended_output_shape, T *output_data) ++{ ++ assert(unextended_input_shape.DimensionsCount() <= 4); ++ assert(unextended_output_shape.DimensionsCount() <= 4); ++ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); ++ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); ++ ++ const int output_depth = output_shape.Dims(3); ++ const int output_width = output_shape.Dims(2); ++ const int output_height = output_shape.Dims(1); ++ ++ const int input_depth = input_shape.Dims(3); ++ const int batch_size = input_shape.Dims(0); ++ ++ // Number of continuous values that we can copy in one iteration.
++ const int stride = params.block_size * input_depth; ++ ++ for (int batch = 0; batch < batch_size; ++batch) ++ { ++ for (int out_h = 0; out_h < output_height; ++out_h) ++ { ++ T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); ++ for (int offset_h = 0; offset_h < params.block_size; ++offset_h) ++ { ++ T *dst = output_ptr; ++ for (int out_w = 0; out_w < output_width; ++out_w) ++ { ++ memcpy(dst, input_data, stride * sizeof(T)); ++ input_data += stride; ++ dst += output_depth; ++ } ++ output_ptr += stride; ++ } ++ } ++ } ++} ++ ++} // namespace cker ++} // namespace nnfw ++ ++#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ +diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h +index 432b181..080f66f 100644 +--- a/compute/cker/include/cker/ruy/RuySupport.h ++++ b/compute/cker/include/cker/ruy/RuySupport.h +@@ -24,7 +24,7 @@ + + namespace + { +-const int kDefaultNumThreadpoolThreads = 4; ++const int kDefaultNumThreadpoolThreads = 1; + } + + namespace nnfw +diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md +index 2bfd14c..657f0f7 100644 +--- a/docs/howto/how-to-build-runtime.md ++++ b/docs/howto/how-to-build-runtime.md +@@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command. + + ``` + $ sudo apt-get install cmake libboost-all-dev +-``` ++``` + + If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file. + +@@ -44,7 +44,7 @@ python3-venv \ + scons \ + software-properties-common \ + unzip \ +-wget ++wget + + $ mkdir /tmp/gtest + $ cd /tmp/gtest +@@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the + ``` + $ git clone https://github.com/Samsung/ONE.git one + $ cd one +-$ cp -n Makefile.template Makefile; make install ++$ make -f Makefile.template install + ``` + + Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows. +diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md +index d7e48c8..08d5fd6 100644 +--- a/docs/nnfw/howto/CrossBuildForAndroid.md ++++ b/docs/nnfw/howto/CrossBuildForAndroid.md +@@ -44,11 +44,9 @@ Different from cross build for linux, + Here is an example of using Makefile. + + ```bash +-cp -n Makefile.template Makefile +- + TARGET_OS=android \ + CROSS_BUILD=1 \ + NDK_DIR=/path/android-tools/r20/ndk \ + EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \ +-make install ++make -f Makefile.template install + ``` +diff --git a/docs/runtime/core.md b/docs/runtime/core.md +index 42ba75f..64a6c62 100644 +--- a/docs/runtime/core.md ++++ b/docs/runtime/core.md +@@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then + + With generated tensors and kernels, the compiler creates executor objects. There are 3 types of executors are supported - Linear, Dataflow, and Parallel. 
Linear executor is the default executor and Dataflow Executor and Parallel Executor are experimental. + +-For more about executors, please refer to [Executors](./executors.md) document. ++For more about executors, please refer to [Executors](executors.md) document. + + ### Module `exec` + +@@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document. + + Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation. + +-Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document. ++Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document. +diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md +index dc39dae..e7a5e27 100644 +--- a/docs/runtime/heterogeneous-execution.md ++++ b/docs/runtime/heterogeneous-execution.md +@@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there + + ![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png) + +-Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. ++Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. + + ## Graph Transformation + +-Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation. ++Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](core.md#1-lowering) phase of compilation. + + Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them. 
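A rough sketch of the boundary rule described above, using a toy operation list rather than the onert IR: whenever a producer and its consumer were assigned different backends, a Permute node is spliced onto the edge so each backend only touches tensors it manages. The `Op` struct and `InsertPermutes` function are illustrative names, not runtime API:

```cpp
#include <cstddef>
#include <string>
#include <vector>

struct Op
{
  std::string name;
  std::string backend; // e.g. "cpu", "acl_cl", "acl_neon"
};

// Insert a Permute wherever two adjacent operations run on different backends.
std::vector<Op> InsertPermutes(const std::vector<Op> &sequence)
{
  std::vector<Op> lowered;
  for (std::size_t i = 0; i < sequence.size(); ++i)
  {
    if (i > 0 && sequence[i - 1].backend != sequence[i].backend)
      lowered.push_back({"Permute", sequence[i - 1].backend + "->" + sequence[i].backend});
    lowered.push_back(sequence[i]);
  }
  return lowered;
}

int main()
{
  // Add on cpu feeding Conv2D on acl_cl gets exactly one Permute in between.
  auto lowered = InsertPermutes({{"Add", "cpu"}, {"Conv2D", "acl_cl"}});
  return lowered.size() == 3 ? 0 : 1;
}
```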
+ +diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake +index 51a235a..adec1f9 100644 +--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake ++++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake +@@ -8,7 +8,7 @@ function(_ARMComputeSource_import) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") +- set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz) ++ set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz) + ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL}) + + set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE) +diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake +index ab0b770..da084e7 100644 +--- a/infra/cmake/packages/FlatBuffersConfig.cmake ++++ b/infra/cmake/packages/FlatBuffersConfig.cmake +@@ -25,7 +25,8 @@ function(_FlatBuffers_build) + BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build + INSTALL_DIR ${EXT_OVERLAY_DIR} + BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS} +- IDENTIFIER "1.10-fix1" ++ IDENTIFIER "1.10-fix2" ++ EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF" + PKG_NAME "FLATBUFFERS") + + endfunction(_FlatBuffers_build) +diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake +index e282e0b..19803f1 100644 +--- a/infra/cmake/packages/HDF5Config.cmake ++++ b/infra/cmake/packages/HDF5Config.cmake +@@ -27,6 +27,7 @@ _HDF5_build() + find_path(HDF5_CONFIG_DIR "hdf5-config.cmake" + PATHS ${EXT_OVERLAY_DIR} + PATH_SUFFIXES ++ cmake + share/cmake + share/cmake/hdf5 + cmake/hdf5 +diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake +new file mode 100644 +index 0000000..3061779 +--- /dev/null ++++ b/infra/cmake/packages/Pybind11Config.cmake +@@ -0,0 +1,21 @@ ++function(_Pybind11_import) ++ nnas_find_package(Pybind11Source QUIET) ++ ++ if(NOT Pybind11Source_FOUND) ++ set(Pybind11_FOUND FALSE PARENT_SCOPE) ++ return() ++ endif(NOT Pybind11Source_FOUND) ++ ++ nnas_include(ExternalBuildTools) ++ ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR} ++ BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build ++ INSTALL_DIR ${EXT_OVERLAY_DIR} ++ IDENTIFIER "2.3.0" ++ PKG_NAME "PYBIND11") ++ ++ find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11) ++ ++ set(Pybind11_FOUND TRUE PARENT_SCOPE) ++endfunction(_Pybind11_import) ++ ++_Pybind11_import() +diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake +new file mode 100644 +index 0000000..4a9c676 +--- /dev/null ++++ b/infra/cmake/packages/Pybind11SourceConfig.cmake +@@ -0,0 +1,18 @@ ++function(_Pybind11Source_import) ++ if(NOT DOWNLOAD_PYBIND11) ++ set(Pybind11Source_FOUND FALSE PARENT_SCOPE) ++ return() ++ endif(NOT DOWNLOAD_PYBIND11) ++ ++ nnas_include(ExternalSourceTools) ++ nnas_include(OptionTools) ++ ++ envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.3.0.tar.gz) ++ ++ ExternalSource_Download(PYBIND11 ${PYBIND11_URL}) ++ ++ set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE) ++ set(Pybind11Source_FOUND TRUE PARENT_SCOPE) ++endfunction(_Pybind11Source_import) ++ ++_Pybind11Source_import() +diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile +index e675b53..052cc4f 100644 +--- a/infra/docker/Dockerfile ++++ b/infra/docker/Dockerfile +@@ -1,8 +1,6 @@ + FROM 
ubuntu:16.04 + + ARG UBUNTU_MIRROR +-ENV http_proxy $http_proxy +-ENV https_proxy $https_proxy + + RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi + RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi +@@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler + + # Additonal tools + RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint ++RUN pip3 install --upgrade pip + RUN pip3 install yapf==0.22.0 numpy + + # Install google test (source) +diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804 +index fc6fc9a..cc31bba 100644 +--- a/infra/docker/Dockerfile.1804 ++++ b/infra/docker/Dockerfile.1804 +@@ -1,12 +1,6 @@ + FROM ubuntu:18.04 + + ARG UBUNTU_MIRROR +-ENV http_proxy $http_proxy +-ENV https_proxy $https_proxy +- +-RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi +-RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi +-RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi + + # Install 'add-apt-repository' + RUN apt-get update && apt-get -qqy install software-properties-common +@@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler + + # Additonal tools + RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint ++RUN pip3 install --upgrade pip + RUN pip3 install yapf==0.22.0 numpy + + # Install google test (source) +diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt +index 3ac6680..0be6885 100644 +--- a/infra/nncc/CMakeLists.txt ++++ b/infra/nncc/CMakeLists.txt +@@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON) + option(DOWNLOAD_PYTORCH "Download Pytorch source" ON) + option(DOWNLOAD_ONNX "Download ONNX source" ON) + option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON) ++option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON) + + option(DOWNLOAD_GTEST "Download Google Test source" ON) + option(BUILD_GTEST "Build Google Test from the downloaded source" ON) +diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount +index d4610e3..d06c5c9 100644 +--- a/infra/nncc/command/utcount ++++ b/infra/nncc/command/utcount +@@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \ + oops pepper-assert \ + hermes hermes-std \ + loco locop locomotiv logo-core logo \ +-foder souschef arser \ ++foder souschef arser vconone \ + safemain mio-circle mio-tflite \ + tflite2circle \ + luci \ +diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt +index 8e7f78e..2442a2d 100644 +--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt ++++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt +@@ -100,7 +100,7 @@ target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES + target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV") + set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON) + target_link_libraries(tensorflow-lite-2.2.0 eigen 
${LIB_PTHREAD} dl) +-if(${BUILD_WITH_NNAPI}) ++if(NOT ANDROID AND ${BUILD_WITH_NNAPI}) + target_link_libraries(tensorflow-lite-2.2.0 rt) + endif() + +diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf +index 515cada..bad9eb2 100644 +--- a/infra/nnfw/config/gbs.conf ++++ b/infra/nnfw/config/gbs.conf +@@ -5,7 +5,7 @@ profile = profile.tizen + [profile.tizen] + user=obs_viewer + obs = obs.tizen +-repos = repo.tizen_base,repo.tizen_mobile ++repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile + buildroot = /home/GBS-ROOT/ + + [obs.tizen] +@@ -15,6 +15,8 @@ url = http://api.tizen.org + url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/ + + [repo.tizen_base] +-url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ ++url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ + ++[repo.tizen_one] ++url = http://nnfw.mooo.com/archive/tizen/ + +diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630 +index e159935..c3ca4b6 100644 +--- a/infra/packaging/preset/20200630 ++++ b/infra/packaging/preset/20200630 +@@ -14,6 +14,7 @@ function preset_configure() + REQUIRED_UNITS+=("souschef") + REQUIRED_UNITS+=("safemain") + REQUIRED_UNITS+=("arser") ++ REQUIRED_UNITS+=("vconone") + # Hermes Logging Framework + REQUIRED_UNITS+=("hermes" "hermes-std") + # loco IR and related utilities +@@ -28,11 +29,14 @@ function preset_configure() + REQUIRED_UNITS+=("record-minmax" "circle-quantizer") + REQUIRED_UNITS+=("one-cmds") + ++ NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)} ++ + # TODO Use "nncc configure" and "nncc build" + cmake \ + -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \ + -DCMAKE_BUILD_TYPE=release \ + -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \ ++ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \ + ${EXTRA_OPTIONS[@]} \ + "${NNAS_PROJECT_PATH}/infra/nncc" + } +@@ -44,14 +48,4 @@ function preset_install() + + # Install tf2nnpkg + install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg" +- +- # Create python virtual enviornment +- python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv" +- +- # Install tensorflow +- source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate" +- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ +- install -U pip setuptools +- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ +- install tensorflow-cpu==2.3.0rc0 + } +diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630 +index 9101f82..7846fd3 100644 +--- a/infra/packaging/res/tf2nnpkg.20200630 ++++ b/infra/packaging/res/tf2nnpkg.20200630 +@@ -14,10 +14,16 @@ command_exists() { + usage() + { + echo "Convert TensorFlow model to nnpackage." 
+- echo "Usage: tf2nnpkg --info --graphdef [OPTION] -o " +- exit 0 ++ echo "Usage: tf2nnpkg" ++ echo " --info " ++ echo " --graphdef " ++ echo " -o " ++ echo " --v2 (optional) Use TF 2.x interface" ++ exit 255 + } + ++TF_INTERFACE="--v1" ++ + # Parse command-line arguments + # + while [ "$#" -ne 0 ]; do +@@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do + export OUTPUT_DIR="$2" + shift 2 + ;; ++ '--v2') ++ TF_INTERFACE="--v2" ++ shift ++ ;; + *) + echo "${CUR}" + shift +@@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' + INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':') + + # generate tflite file +-python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \ +---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ +---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \ +-python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \ ++python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \ + --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ + --input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \ + --output_arrays ${OUTPUT} +diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh +new file mode 100755 +index 0000000..22fb335 +--- /dev/null ++++ b/infra/scripts/build-tcm.sh +@@ -0,0 +1,24 @@ ++#!/bin/bash ++# ++# STEP 1 ++# Download latest TCM tool from ++# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar ++# ++# STEP 2 ++# Create symbolic link `./src` for source directory to be analyzed which has `.ahub` configuration. ++# ++# STEP 3 ++# run this `build-tcm.sh` script. ++# ++# See the following link for additional details. ++# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest ++# ++ ++echo ${PROJECT_DIR:=${PWD}} ++ ++java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \ ++ --outdir=$PROJECT_DIR/tcm-output \ ++ --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \ ++ --local=$PROJECT_DIR/src \ ++ --logfile=$PROJECT_DIR/tcm-output/tcm.log \ ++ --debug +diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh +index d436e8a..a0323e0 100644 +--- a/infra/scripts/compiler_modules.sh ++++ b/infra/scripts/compiler_modules.sh +@@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex" + DEBUG_BUILD_ITEMS+=";oops;pepper-assert" + DEBUG_BUILD_ITEMS+=";hermes;hermes-std" + DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo" +-DEBUG_BUILD_ITEMS+=";foder;souschef;arser" ++DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone" + DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite" + DEBUG_BUILD_ITEMS+=";tflite2circle" + DEBUG_BUILD_ITEMS+=";luci" +diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh +index 7da6736..011d14c 100755 +--- a/infra/scripts/docker_build_cross_aarch64_runtime.sh ++++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh +index f1f666a..551fb57 100755 +--- a/infra/scripts/docker_build_cross_arm_runtime.sh ++++ b/infra/scripts/docker_build_cross_arm_runtime.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh +index ea66f17..876f318 100755 +--- a/infra/scripts/docker_build_cross_arm_runtime_release.sh ++++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh +index 08244e5..f42251b 100755 +--- a/infra/scripts/docker_build_cross_coverage.sh ++++ b/infra/scripts/docker_build_cross_coverage.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh +index 418b50d..5b12531 100755 +--- a/infra/scripts/docker_build_nncc.sh ++++ b/infra/scripts/docker_build_nncc.sh +@@ -54,6 +54,16 @@ pushd $ROOT_PATH > /dev/null + mkdir -p ${NNCC_INSTALL_PREFIX} + ./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}" + ++# create python virtual environment ++./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv" ++ ++./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ ++ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ ++ install -U pip setuptools ++./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ ++ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ ++ install tensorflow-cpu==2.3.0rc0 ++ + mkdir -p ${ARCHIVE_PATH} + tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./ + +diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh +index 18809ad..ee0f183 100755 +--- a/infra/scripts/docker_build_tizen_cross.sh ++++ b/infra/scripts/docker_build_tizen_cross.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh +index 556c5bd..55adaa1 100755 +--- a/infra/scripts/docker_collect_nnpkg_resources.sh ++++ b/infra/scripts/docker_collect_nnpkg_resources.sh +@@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null + REQUIRED_UNITS=() + # Common Libraries + REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex") +-REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops") ++REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone") + # Hermes Logging Framework + REQUIRED_UNITS+=("hermes" "hermes-std") + # loco IR and related utilities +diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh +index 5521b5f..640a0e0 100755 +--- a/infra/scripts/tizen_xu4_test.sh ++++ b/infra/scripts/tizen_xu4_test.sh +@@ -23,7 +23,7 @@ function install_model() + { + # download tflite model files + pushd $HOST_HOME +- tests/scripts/framework/run_test.sh --download=on ++ tests/scripts/framework/run_test.sh --download=on --run=off + # TODO Since this command removes model file(.zip), + # We must always download the file unlike model file(.tflite). + # Because caching applies only to tflite file. +diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec +index ce1cd0b..e26ffcb 100644 +--- a/packaging/nnfw.spec ++++ b/packaging/nnfw.spec +@@ -30,7 +30,7 @@ BuildRequires: flatbuffers-devel + %ifarch %{arm} aarch64 + # Require python for acl-ex library build pre-process + BuildRequires: python +-BuildRequires: libarmcl-devel ++BuildRequires: libarmcl-devel >= v20.05 + %endif + + Requires(post): /sbin/ldconfig +diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe +new file mode 100644 +index 0000000..7322e90 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe +@@ -0,0 +1,26 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 1 dim: 8 dim: 8 dim: 1 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 7 dim: 7 dim: 1 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operation { ++ type: "AveragePool2D" ++ averagepool2d_options { ++ padding: VALID ++ stride_w: 1 ++ stride_h: 1 ++ filter_width: 2 ++ filter_height: 2 ++ } ++ input: "ifm" ++ output: "ofm" ++} ++input: "ifm" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe +new file mode 100644 +index 0000000..a09afc1 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe +@@ -0,0 +1,44 @@ ++operand { ++ name: "ifm" ++ type: FLOAT32 ++ shape { dim: 1 dim: 4 dim: 5 dim: 5 } ++} ++operand { ++ name: "ker" ++ type: FLOAT32 ++ shape { dim: 1 dim: 1 dim: 2 dim: 25 } ++} ++operand { ++ name: "bias" ++ type: FLOAT32 ++ shape { dim: 25 } ++ filler { ++ tag: "constant" ++ arg: "1.1" ++ } ++} ++operand { ++ name: "ofm" ++ type: FLOAT32 ++ shape { dim: 1 dim: 2 dim: 2 dim: 25 } ++} ++operation { ++ type: "DepthwiseConv2D" ++ version: 2 ++ depthwiseconv2d_options { ++ padding: VALID ++ stride_w: 2 ++ stride_h: 2 ++ dilation_w_factor: 2 ++ 
dilation_h_factor: 1 ++ depth_multiplier: 5 ++ activation : RELU6 ++ } ++ input: "ifm" ++ input: "ker" ++ input: "bias" ++ output: "ofm" ++} ++input: "ifm" ++input: "ker" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule +new file mode 100644 +index 0000000..edfabc6 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule +@@ -0,0 +1,3 @@ ++# To check if DEPTHWISE_CONV_2D version is 2 ++ ++RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2 +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe +new file mode 100644 +index 0000000..5e0b6b5 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe +@@ -0,0 +1,61 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 1 dim: 112 dim: 112 dim: 4 } ++ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } ++} ++operand { ++ name: "ker" ++ type: UINT8 ++ shape { dim: 1 dim: 3 dim: 3 dim: 4 } ++ filler { ++ tag: "gaussian" ++ arg: "0.0" ++ arg: "1.0" ++ } ++ quant { ++ min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594 ++ max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97 ++ scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821 ++ zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87 ++ quantized_dimension: 3 ++ } ++} ++operand { ++ name: "bias" ++ type: INT32 ++ shape { dim: 4 } ++ filler { ++ tag: "gaussian" ++ arg: "0" ++ arg: "1.0" ++ } ++ quant { ++ scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16 ++ zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0 ++ } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 112 dim: 112 dim: 4 } ++ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } ++ ++} ++operation { ++ type: "DepthwiseConv2D" ++ depthwiseconv2d_options { ++ padding: SAME ++ stride_w: 1 ++ stride_h: 1 ++ depth_multiplier: 1 ++ activation : RELU6 ++ } ++ input: "ifm" ++ input: "ker" ++ input: "bias" ++ output: "ofm" ++} ++input: "ifm" ++input: "ker" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe +new file mode 100644 +index 0000000..3fff5cd +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe +@@ -0,0 +1,22 @@ ++operand { ++ name: "ifm1" ++ type: UINT8 ++ shape { dim: 1 dim: 4 dim: 4 dim: 3 } ++ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 4 dim: 4 dim: 3 } ++ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} ++} ++operation { ++ type: "L2Normalize" ++ l2norm_options { ++ activation: NONE ++ } ++ input: "ifm1" ++ output: "ofm" ++} ++input: "ifm1" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe +new file 
mode 100644 +index 0000000..7b2a84d +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe +@@ -0,0 +1,19 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 1 dim: 3 dim: 3 dim: 2 } ++ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 3 dim: 3 dim: 2 } ++ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } ++} ++operation { ++ type: "Logistic" ++ input: "ifm" ++ output: "ofm" ++} ++input: "ifm" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe +index 79271a4..1313e26 100644 +--- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe ++++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe +@@ -10,7 +10,7 @@ operand { + operand { + name: "ker" + type: FLOAT32 +- shape { dim: 1 dim: 3 dim: 3 dim: 1 } ++ shape { dim: 3 dim: 1 dim: 1 dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" +diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe +new file mode 100644 +index 0000000..887380c +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: FLOAT32 ++ shape { dim: 4 } ++} ++operand { ++ name: "ofm" ++ type: FLOAT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT32 ++ shape { dim: 4 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT32 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe +new file mode 100644 +index 0000000..9beb516 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: FLOAT32 ++ shape { dim: 4 } ++} ++operand { ++ name: "ofm" ++ type: FLOAT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT64 ++ shape { dim: 4 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT64 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe +new file mode 100644 +index 0000000..67b947f +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: INT32 ++ shape { dim: 5 } ++} ++operand { ++ name: "ofm" ++ type: INT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT32 ++ shape { dim: 5 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT32 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse +new file mode 100644 
+index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe +new file mode 100644 +index 0000000..375db66 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: INT32 ++ shape { dim: 5 } ++} ++operand { ++ name: "ofm" ++ type: INT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT64 ++ shape { dim: 5 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT64 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe +new file mode 100644 +index 0000000..d3985e4 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe +@@ -0,0 +1,28 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 4 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT32 ++ shape { dim: 4 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT32 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe +new file mode 100644 +index 0000000..b08dd85 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe +@@ -0,0 +1,28 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 5 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT64 ++ shape { dim: 5 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT64 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt +index 2af0ffa..748b2d1 100644 +--- a/runtime/libs/benchmark/CMakeLists.txt ++++ b/runtime/libs/benchmark/CMakeLists.txt +@@ -1,6 +1,5 @@ + file(GLOB_RECURSE SOURCES "src/*.cpp") + +-add_library(nnfw_lib_benchmark SHARED ${SOURCES}) ++add_library(nnfw_lib_benchmark STATIC ${SOURCES}) + target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD}) +-install(TARGETS nnfw_lib_benchmark DESTINATION lib) +diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp +index 7a3f9a5..df573da 100644 +--- a/runtime/libs/benchmark/src/Result.cpp ++++ b/runtime/libs/benchmark/src/Result.cpp +@@ -166,7 +166,7 @@ Result::Result(const Phases &phases) + if (option.memory) + { + print_memory = true; +- for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i) ++ for (int i = 
PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i) + { + auto phase = phases.at(gPhaseStrings[i]); + for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j) +diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h +index 031aabd..03a3aed 100644 +--- a/runtime/onert/api/include/nnfw.h ++++ b/runtime/onert/api/include/nnfw.h +@@ -99,6 +99,8 @@ typedef enum { + NNFW_STATUS_ERROR = 1, + /** Unexpected null argument is given. */ + NNFW_STATUS_UNEXPECTED_NULL = 2, ++ /** When a function was called but it is not valid for the current session state. */ ++ NNFW_STATUS_INVALID_STATE = 3, + } NNFW_STATUS; + + /** +@@ -432,10 +434,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index, + * + *

Supported backends differs on each platforms. + * For example, `x86_64` supports "cpu" only. +- * Can set multiple backends by semicolon (ex: "acl_cl;cpu"). +- * Among the multiple backends, the 1st element is used as default backend. +- * +- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn" ++ * Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu"). ++ * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during ++ * {@link nnfw_prepare}. ++ * Among the multiple backends, the 1st element is used as the default backend. + * + * @param[in] session session to which avilable backends are set + * @param[in] backends available backends on which nnfw uses +@@ -449,12 +451,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe + * + * This function should be called before {@link nnfw_prepare} is invoked. + * +- * Supported backends differs on each platforms. +- * For example, `x86_64` supports "cpu" only. +- * The backend for op has higher priority than available backends specified by +- * nnfw_set_available_backends. ++ * The backend for op has higher priority than available backends specified by ++ * {@link nnfw_set_available_backends}.
+ * +- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon" ++ * @deprecated Deprecated since 1.8.0. + * + * @param[in] session session to be modified + * @param[in] op operation to be set +diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc +index 0747583..34a46ed 100644 +--- a/runtime/onert/api/src/nnfw_api.cc ++++ b/runtime/onert/api/src/nnfw_api.cc +@@ -31,6 +31,7 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5); + STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0); + STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1); + STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2); ++STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3); + + STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0); + STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1); +diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc +index d03ddd4..b3390fa 100644 +--- a/runtime/onert/api/src/nnfw_api_internal.cc ++++ b/runtime/onert/api/src/nnfw_api_internal.cc +@@ -76,7 +76,7 @@ nnfw_session::~nnfw_session() = default; + NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) + { + if (!isStateInitialized()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + if (!package_dir) + { +@@ -156,7 +156,7 @@ NNFW_STATUS nnfw_session::prepare() + std::cerr << "invalid state"; + } + std::cerr << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase()) +@@ -188,7 +188,7 @@ NNFW_STATUS nnfw_session::run() + { + std::cerr << "Error during nnfw_session::run : " + << "run should be run after prepare" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + try +@@ -211,7 +211,7 @@ NNFW_STATUS nnfw_session::run_async() + { + std::cerr << "Error during nnfw_session::run_async : " + << "run_async should be run after prepare" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + _execution->startExecute(); +@@ -241,7 +241,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + if (!buffer && length != 0) +@@ -270,7 +270,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + if (!buffer && length != 0) +@@ -296,7 +296,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b + NNFW_STATUS nnfw_session::input_size(uint32_t *number) + { + if (isStateInitialized()) // Model is not loaded +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -318,7 +318,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number) + NNFW_STATUS nnfw_session::output_size(uint32_t *number) + { + if (isStateInitialized()) // Model is not loaded +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -410,7 +410,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti) + { + std::cerr << "Error during set_input_tensorinfo : should be run after load_model" + << std::endl; +- return NNFW_STATUS_ERROR; ++ return 
NNFW_STATUS_INVALID_STATE; + } + + if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK) +@@ -463,6 +463,9 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const nnfw_tensor + + NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) + { ++ if (isStateInitialized()) ++ return NNFW_STATUS_INVALID_STATE; ++ + try + { + if (ti == nullptr) +@@ -499,7 +502,7 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) + NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) + { + if (isStateInitialized()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + if (ti == nullptr) + { +@@ -570,7 +573,7 @@ static std::string get_op_backend_string(std::string op) + NNFW_STATUS nnfw_session::set_available_backends(const char *backends) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -596,7 +599,7 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends) + NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -627,7 +630,7 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) + NNFW_STATUS nnfw_session::set_config(const char *key, const char *value) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + auto &options = _compiler->options(); + +@@ -693,7 +696,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph() + NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + auto &options = _compiler->options(); + +diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc +index 3ca4058..4ab2d4c 100644 +--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc ++++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc +@@ -31,6 +31,7 @@ + #include "exec/FunctionSequence.h" + #include "util/logging.h" + #include "util/Utils.h" ++#include "AclKernelGen.h" + + namespace onert + { +@@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + + assert(_ctx.at(block_size_index).data()); + + auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>(); + +- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- const auto input_sub_type = 
_ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 +- ? arm_compute::SubDataType::BOOL +- : arm_compute::SubDataType::NONE; ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLCast>(); ++ std::unique_ptr<::arm_compute::IFunction> fn; ++ if (ifm_tensor->data_type() == ofm_tensor->data_type()) ++ { ++ auto l = std::make_unique<::arm_compute::CLCopy>(); ++ ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); ++ fn = std::move(l); ++ } ++ else ++ { ++ auto l = std::make_unique<::arm_compute::CLCast>(); ++ ++ // TODO Support converting float to int32 as round down ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); ++ ++ fn = std::move(l); ++ } + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + ker_width, ker_height); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), +- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), ++ ::arm_compute::Size2D(1U, 1U), act_info); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + { + auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), +- ofm_alloc->handle(), conv_info, multiplier, act_info); ++ 
fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, multiplier, act_info); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -217,19 +231,20 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, +- ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride)}; ++ ::arm_compute::PoolingLayerInfo info{ ++ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::AvgPool2D &node) +@@ -260,19 +275,21 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), ++ true /* exclude_padding */}; + + auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Concat &node) +@@ -296,7 +313,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + return; + } + +- auto output_alloc = _tensor_builder->at(ofm_index).get(); ++ auto output_tensor = _tensor_builder->at(ofm_index).get(); + std::vector<::arm_compute::ICLTensor *> input_tensors; + for (auto &ifm_ind : input_indexes) + input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); +@@ -305,7 +322,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + if (input_indexes.size() < 2) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); +- l->configure(input_tensors.at(0), output_alloc->handle()); ++ l->configure(input_tensors.at(0), output_tensor->handle()); + fn = std::move(l); + } + else +@@ -313,10 +330,10 @@ void KernelGenerator::visit(const 
ir::operation::Concat &node) + auto l = std::make_unique<::arm_compute::CLConcatenateLayer>(); + const auto rank = _ctx.at(ofm_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + const auto fixed_axis = + acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); +- l->configure(input_tensors, output_alloc->handle(), fixed_axis); ++ l->configure(input_tensors, output_tensor->handle(), fixed_axis); + fn = std::move(l); + } + +@@ -327,75 +344,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + + void KernelGenerator::visit(const ir::operation::FullyConnected &node) + { +- using ir::operation::FullyConnected; +- + const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; +- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; +- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; +- +- const auto input_rank = _ctx.at(input_index).shape().rank(); +- +- const auto output_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); +- UNUSED_RELEASE(output_size); +- assert(_ctx.at(bias_index).shape().dim(0) == output_size); +- assert(_ctx.at(weight_index).shape().dim(0) == output_size); +- const auto batch_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); +- const auto input_size = +- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); +- +- // Check for reshaping input's shape into rank-2 +- bool needs_reshape = false; +- ir::Shape reshape(2); +- if (input_rank == 3 || input_rank == 4) +- { +- const auto &ifm_shape = _ctx.at(input_index).shape(); +- auto feature_size = 1; +- for (int i = 0; i < ifm_shape.rank(); ++i) +- { +- feature_size *= ifm_shape.dim(i); +- } +- +- UNUSED_RELEASE(feature_size); +- assert(feature_size == batch_size * input_size); +- +- // for reshaping +- needs_reshape = true; +- reshape.dim(0) = batch_size; /* H */ +- reshape.dim(1) = input_size; /* W */ +- } +- ++ auto output_tensor = _tensor_builder->at(output_index).get(); + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- const auto input_alloc = _tensor_builder->at(input_index).get(); +- const auto weight_alloc = _tensor_builder->at(weight_index).get(); +- const auto bias_alloc = _tensor_builder->at(bias_index).get(); +- const auto frontend_layout = _current_op_seq_layout; +- const auto acl_layout = output_alloc->handle()->info()->data_layout(); +- +- auto fn = std::make_unique( +- _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- +- arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type = +- arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL; +- if (_ctx.at(weight_index).isConstant()) +- { +- kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; +- assert(_ctx.at(weight_index).data()); +- } +- fn->configure( +- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), +- needs_reshape, +- ::onert::backend::acl_common::asTensorShape( +- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), +- kernel_type); +- ++ auto fn = acl_common::kernelGenFullyConnected( ++ node, _ctx, _tensor_builder, _current_op_seq_layout); + 
_return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), +- ActivationBuilder::generate(activation, output_alloc->handle())); ++ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Mul &node) +@@ -406,17 +363,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Reduce &node) +@@ -427,14 +385,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + const auto keep_dims{node.param().keep_dims}; + const auto reduce_type = node.param().reduce_type; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // Convert to ACL axes taking into account negative values and possible duplicates. 
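// A rough sketch of what the acl_common axis helpers below are assumed to do
// (not verified against acl_common, just the usual onert convention):
//   1. wrap negatives:  axis < 0  ->  axis += input_rank   (e.g. -1 -> 3 for rank 4)
//   2. drop duplicates, since reducing the same axis twice changes nothing
//   3. permute each axis from the frontend layout (e.g. NHWC) to the backend layout,
//      then reverse it into ACL's innermost-first numbering, roughly rank - 1 - axis.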
+ const auto &axes = _ctx.at(axes_index); + const auto input_rank = _ctx.at(input_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = input_alloc->layout(); ++ const auto backend_layout = input_tensor->layout(); + + std::unique_ptr fn; + if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) +@@ -443,7 +401,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + + const auto acl_axes = + acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); +- l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle()); ++ l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle()); + + fn = std::move(l); + } +@@ -453,7 +411,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout); +- l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims, ++ l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims, + acl_common::convertReduceType(reduce_type)); + + fn = std::move(l); +@@ -469,13 +427,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not be changed the layout from frontend to backend + // So, PermutationOperationPass makes layouts of frontend and backend the same. 
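// Put differently: CLReshapeLayer is just a copy in backend memory order, so the layout
// question can presumably be ignored only while both tensors stay below rank 4 (where the
// NHWC/NCHW distinction does not reorder data in this backend); the assert below rejects
// rank >= 4 unless the frontend and backend layouts already agree, which
// PermutationOperationPass is expected to guarantee.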
+ const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || + frontend_layout == backend_layout); + UNUSED_RELEASE(frontend_layout); +@@ -483,7 +441,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + + auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -503,10 +461,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) + (void)dims; + (void)ndim; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + auto fn = std::make_unique(); +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + auto acl_fn = asAclClFunction(std::move(fn)); + _return_fn = std::move(acl_fn); + } +@@ -516,15 +474,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -538,13 +496,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) + + const auto beta = node.param().beta; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), beta); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), beta); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -558,10 +516,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + 
// Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -613,7 +571,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + + auto fn = std::make_unique<::arm_compute::CLSlice>(); + +- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -628,10 +586,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -704,7 +662,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + + auto fn = std::make_unique<::arm_compute::CLStridedSlice>(); + +- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, + strides_set, begin_mask, end_mask, shrink_axis_mask); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -720,10 +678,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + + std::vector pv(perm.cbegin(), perm.cend()); + // Reversed +@@ -732,7 +690,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + + auto fn = std::make_unique<::arm_compute::CLPermute>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -747,17 +705,18 @@ void KernelGenerator::visit(const ir::operation::Add &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- 
asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Sub &node) +@@ -768,17 +727,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Div &node) +@@ -789,16 +749,17 @@ void KernelGenerator::visit(const ir::operation::Div &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Exp &node) +@@ -806,12 +767,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLExpLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -823,12 +784,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = 
_tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -842,20 +803,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto gamma_alloc = _tensor_builder->at(gamma_index).get(); +- auto beta_alloc = _tensor_builder->at(beta_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto gamma_tensor = _tensor_builder->at(gamma_index).get(); ++ auto beta_tensor = _tensor_builder->at(beta_index).get(); + auto epsilon = node.param().epsilon; + auto activation = node.param().activation; + + auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), +- beta_alloc->handle(), epsilon); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), ++ beta_tensor->handle(), epsilon); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Logistic &node) +@@ -863,15 +825,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -884,13 +846,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), + 
::arm_compute::BinaryLogicalOperation::AND); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -900,159 +862,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) + + void KernelGenerator::visit(const ir::operation::LSTM &node) + { +- // TODO Support dynamic rnn +- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. +- const auto scratch_buffer_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; +- const auto output_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; +- const auto cell_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; +- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; +- +- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; +- const auto input_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional +- const auto input_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; +- const auto input_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; +- const auto input_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; +- const auto recurrent_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional +- const auto recurrent_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; +- const auto recurrent_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; +- const auto recurrent_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; +- const auto cell_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional +- const auto cell_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional +- const auto cell_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional +- const auto input_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; +- const auto forget_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; +- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; +- const auto output_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; +- const auto projection_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional +- const auto projection_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional +- const auto output_state_in_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; +- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; +- const auto cell_threshold = node.param().cell_threshold; +- const auto projection_threshold = node.param().projection_threshold; +- +- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && +- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; +- bool has_recurrent_to_input_weights = +- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) 
!= 0 && +- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; +- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; +- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; +- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && +- _ctx.at(projection_weights_index).shape().dim(1) != 0; +- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); +- +- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. +- // true: no CIFG +- // false: CIFG +- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). +- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; +- +- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. +- // But the cell_to_input_weights does not exist in regular CIFG although peephole. +- // true: peephole +- // false: no peephole +- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; +- +- // NOTE Although the projection weights has data the projection bias may not have data. +- bool has_projection_param = has_projection_weights; +- +- const auto activation = node.param().activation; +- const auto cell_clip = cell_threshold; +- const auto projection_clip = projection_threshold; +- assert(cell_clip >= 0.f && projection_clip >= 0.f); +- +- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); +- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); +- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); +- auto output_alloc = _tensor_builder->at(output_index).get(); +- +- auto input_alloc = _tensor_builder->at(input_index).get(); +- +- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); +- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); +- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); +- auto recurrent_to_forget_weights_alloc = +- _tensor_builder->at(recurrent_to_forget_weights_index).get(); +- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); +- auto recurrent_to_output_weights_alloc = +- _tensor_builder->at(recurrent_to_output_weights_index).get(); +- +- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); +- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); +- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); +- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); +- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); +- +- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); +- +- auto fn = std::make_unique<::arm_compute::CLLSTMLayer>(); +- +- ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; +- if (has_cifg_param) +- { +- auto input_to_input_weights_alloc = +- _tensor_builder->at(input_to_input_weights_index).get(); // optional +- auto recurrent_to_input_weights_alloc = +- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional +- auto cell_to_input_weights_handle = +- has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() +- : nullptr; // optional (non-cifg && peephole) +- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional +- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), +- recurrent_to_input_weights_alloc->handle(), +- cell_to_input_weights_handle, input_gate_bias_alloc->handle()); +- } +- if (has_peephole_param) +- { +- auto cell_to_forget_weights_alloc = +- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional +- auto cell_to_output_weights_alloc = +- _tensor_builder->at(cell_to_output_weights_index).get(); // optional +- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), +- cell_to_output_weights_alloc->handle()); +- } +- if (has_projection_param) +- { +- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional +- auto projection_bias_handle = has_projection_bias +- ? _tensor_builder->at(projection_bias_index).get()->handle() +- : nullptr; // optional +- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); +- } +- +- fn->configure( +- input_alloc->handle(), input_to_forget_weights_alloc->handle(), +- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), +- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), +- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), +- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), +- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), +- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), +- lstm_params, act_info, cell_clip, projection_clip); +- +- auto acl_fn = asAclClFunction(std::move(fn)); +- +- _return_fn = std::move(acl_fn); ++ _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); + } + + void KernelGenerator::visit(const ir::operation::Comparison &node) +@@ -1063,13 +874,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) + + const auto comparison_type = node.param().comparison_type; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::CLComparison>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -1107,13 +918,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + for (const auto &input_index : input_indexes) + { + size_t input_rank = _ctx.at(input_index).shape().rank(); +- const auto &input_alloc = _tensor_builder->at(input_index); +- orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape()); +- assert(input_rank == input_alloc->num_dimensions()); +- if (input_rank != input_alloc->info()->num_dimensions()) ++ const auto &input_tensor = _tensor_builder->at(input_index); ++ orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); ++ 
assert(input_rank == input_tensor->num_dimensions()); ++ if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1135,8 +946,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(0)}; + const auto permute_type = node.getPermuteType(); +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto rank = _ctx.at(ofm_idx).shape().rank(); + assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); + +@@ -1149,7 +960,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::CLPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1160,7 +971,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::CLPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1168,7 +979,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } +@@ -1183,12 +994,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -1198,15 +1009,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1219,12 +1030,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) + + const 
auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLScale>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + +@@ -1238,15 +1049,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1258,15 +1069,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1288,25 +1099,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) + + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); + +- auto input_alloc = _tensor_builder->at(input_index).get(); +- auto weights_alloc = _tensor_builder->at(weights_index).get(); +- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); +- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); ++ auto weights_tensor = _tensor_builder->at(weights_index).get(); ++ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); ++ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = 
::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = std::make_unique<::arm_compute::CLCopy>(); +- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); ++ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); + _return_fn = asAclClFunction(std::move(copy_layer)); + +- auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>( ++ auto fn = std::make_unique<::arm_compute::CLRNNLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), +- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), +- act_info); ++ fn->configure(input_tensor->handle(), weights_tensor->handle(), ++ recurrent_weights_tensor->handle(), bias_tensor->handle(), ++ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); + _return_fn = asAclClFunction(std::move(fn)); + } + +@@ -1315,12 +1126,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLFloor>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1335,10 +1146,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); +- auto paddings_alloc = _tensor_builder->at(paddings_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); ++ auto paddings_tensor = _tensor_builder->at(paddings_index).get(); + + assert(_ctx.at(block_size_index).data()); + assert(_ctx.at(paddings_index).data()); +@@ -1346,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>(); +- l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), +- ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), ++ ofm_tensor->handle()); + fn = std::move(l); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -1362,12 +1173,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) + + auto block_size = node.param().block_size; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>(); ++ auto fn = 
std::make_unique<::arm_compute::CLSpaceToDepthLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1389,19 +1200,21 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), + ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) +@@ -1410,13 +1223,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>(); + +- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); ++ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1442,15 +1255,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. 
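// Why these constants amount to an L2 normalization (a sketch, assuming ACL's unscaled
// cross-map formula  out = in / (bias + alpha * sum(in^2))^beta  when the last
// NormalizationLayerInfo argument below is false):
//   bias = 0, beta = 0.5   ->   out = in / sqrt(alpha * sum(in^2))
// i.e. in / ||in||_2 over the normalization window, consistent with the comments above.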
+ +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1466,17 +1279,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hits_alloc = _tensor_builder->at(hits_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hits_tensor = _tensor_builder->at(hits_index).get(); + +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto keys_alloc = _tensor_builder->at(keys_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto keys_tensor = _tensor_builder->at(keys_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::CLHashtableLookup>(); + +- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), +- output_alloc->handle(), hits_alloc->handle()); ++ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), ++ output_tensor->handle(), hits_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1489,13 +1302,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto alpha_alloc = _tensor_builder->at(alpha_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto alpha_tensor = _tensor_builder->at(alpha_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLPReLU>(); ++ auto fn = std::make_unique<::arm_compute::CLPReluLayer>(); + +- fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1518,7 +1331,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) + (node.param().padding.type == ir::PaddingType::VALID)); + auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); +- + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == ir::PaddingType::VALID) +@@ -1528,17 +1340,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + +- auto ofm_alloc = 
_tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, +- invalid_horizontal, invalid_vertical); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), ++ tconv_info, invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1550,15 +1362,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1571,13 +1383,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::CLBitwiseOr>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1589,12 +1401,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLBitwiseNot>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1607,13 +1419,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) + 
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1634,13 +1446,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) + + const auto k = node.param().k; + +- auto values_alloc = _tensor_builder->at(outputValues_index).get(); +- auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); +- auto input_alloc = _tensor_builder->at(inputData_index).get(); ++ auto values_tensor = _tensor_builder->at(outputValues_index).get(); ++ auto indices_tensor = _tensor_builder->at(outputIndices_index).get(); ++ auto input_tensor = _tensor_builder->at(inputData_index).get(); + + auto fn = std::make_unique<::arm_compute::CLTopKV2>(); + +- fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); ++ fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1659,9 +1471,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); + const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto indices_alloc = _tensor_builder->at(indices_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto indices_tensor = _tensor_builder->at(indices_index).get(); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If not the same, we have to add a stage(?) to perform permutation of output tensor. It +@@ -1671,43 +1483,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
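// A concrete instance of the case described above (sketch; the actual mapping is done by
// acl_common::ToARMComputeAxis): in a rank-4 NHWC model the frontend axes are
// {0:N, 1:H, 2:W, 3:C}, while an NCHW backend stores the same data as {N, C, H, W}.
// A gather with axis == 2 (W) and rank-2 indices would have to treat W and C as adjacent
// dimensions, but in NCHW they are not, so the assert below only accepts ifm_rank >= 4
// when the frontend and backend layouts match.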
+- const auto backend_layout = ofm_alloc->layout(); ++ const auto backend_layout = ofm_tensor->layout(); + UNUSED_RELEASE(backend_layout); +- assert(backend_layout == ifm_alloc->layout()); +- assert(backend_layout == indices_alloc->layout()); ++ assert(backend_layout == ifm_tensor->layout()); ++ assert(backend_layout == indices_tensor->layout()); + assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); + + auto fn = std::make_unique<::arm_compute::CLGatherEx>(); + + // input is n-D, indices k-D, output is (n + k - 1)-D + size_t n = ifm_rank; +- assert(n == ifm_alloc->num_dimensions()); ++ assert(n == ifm_tensor->num_dimensions()); + size_t k = _ctx.at(indices_index).shape().rank(); +- assert(k == indices_alloc->num_dimensions()); ++ assert(k == indices_tensor->num_dimensions()); + + // Disable applied dim_correction +- const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape(); +- if (n != ifm_alloc->info()->num_dimensions()) ++ const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); ++ if (n != ifm_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction + const auto ifm = _ctx.at(ifm_index); +- ifm_alloc->info()->set_tensor_shape( ++ ifm_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + } +- const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape(); +- if (k != indices_alloc->info()->num_dimensions()) ++ const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); ++ if (k != indices_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and indices tensor is applied dim_correction + const auto indices = _ctx.at(indices_index); +- indices_alloc->info()->set_tensor_shape( ++ indices_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + } + +- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); ++ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); + + // Revert disabling applied dim_correction +- ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); +- indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape); ++ ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); ++ indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1719,12 +1531,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLNeg>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1736,15 +1548,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto 
input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1761,11 +1573,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + assert((ifm_shape.rank() - 1) == ofm_shape.rank()); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + auto frontend_layout = _current_op_seq_layout; +- auto backend_layout = ifm_alloc->layout(); ++ auto backend_layout = ifm_tensor->layout(); + + int axis_value = node.param().axis; + if (axis_value < 0) +@@ -1776,10 +1588,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + auto acl_axis = + acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); + +- auto fn = std::make_unique<::arm_compute::CLArgOperation>(); ++ auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis}, +- ::arm_compute::ArgOperation::MAX); ++ fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), ++ ::arm_compute::ReductionOperation::ARG_IDX_MAX); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1791,12 +1603,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLCast>(); ++ auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1814,15 +1626,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod + auto beta = node.param().beta; + auto bias = node.param().bias; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1837,12 +1649,12 @@ void KernelGenerator::visit(const 
ir::operation::DepthToSpace &node) + auto block_size = node.param().block_size; + assert(block_size > 0); + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLDepthToSpace>(); ++ auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1860,13 +1672,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- std::vector output_allocs; ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ std::vector output_tensors; + for (const auto &ofm_ind : output_indexes) +- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); ++ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + auto axis = node.param().axis; + if (axis < 0) + axis += ifm_rank; +@@ -1874,7 +1686,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) + + auto fn = std::make_unique<::arm_compute::CLSplit>(); + +- fn->configure(ifm_alloc->handle(), output_allocs, axis); ++ fn->configure(ifm_tensor->handle(), output_tensors, axis); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -1906,13 +1718,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + for (const auto &output_index : output_indexes) + { + size_t output_rank = _ctx.at(output_index).shape().rank(); +- const auto &output_alloc = _tensor_builder->at(output_index); +- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); +- assert(output_rank == output_alloc->num_dimensions()); +- if (output_rank != output_alloc->info()->num_dimensions()) ++ const auto &output_tensor = _tensor_builder->at(output_index); ++ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); ++ assert(output_rank == output_tensor->num_dimensions()); ++ if (output_rank != output_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1959,12 +1771,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node) + + // Disable applied dim_correction + size_t input_rank = _ctx.at(input_index).shape().rank(); +- const auto &input_alloc = _tensor_builder->at(input_index); +- assert(input_rank == input_alloc->num_dimensions()); +- if (input_rank != input_alloc->info()->num_dimensions()) ++ const auto &input_tensor = _tensor_builder->at(input_index); ++ assert(input_rank == input_tensor->num_dimensions()); ++ if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- 
input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(input_index).shape(), frontend_layout, backend_layout, false)); + } + +@@ -1982,13 +1794,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLElementwiseMin>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -2001,13 +1813,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLElementwiseMax>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -2019,12 +1831,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, + 0); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -2037,12 +1849,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, + 0); + + auto acl_fn = 
asAclClFunction(std::move(fn)); +diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h +new file mode 100644 +index 0000000..6253434 +--- /dev/null ++++ b/runtime/onert/backend/acl_common/AclKernelGen.h +@@ -0,0 +1,269 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ ++#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ ++ ++#include ++#include ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace acl_common ++{ ++ ++template ++std::unique_ptr ++kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands, ++ const std::shared_ptr &tensor_builder) ++{ ++ // TODO Support dynamic rnn ++ // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. ++ const auto scratch_buffer_index{ ++ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; ++ const auto output_state_out_index{ ++ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; ++ const auto cell_state_out_index{ ++ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; ++ const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; ++ ++ const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; ++ const auto input_to_input_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional ++ const auto input_to_forget_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; ++ const auto input_to_cell_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; ++ const auto input_to_output_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; ++ const auto recurrent_to_input_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional ++ const auto recurrent_to_forget_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; ++ const auto recurrent_to_cell_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; ++ const auto recurrent_to_output_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; ++ const auto cell_to_input_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional ++ const auto cell_to_forget_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional ++ const auto cell_to_output_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional ++ const auto input_gate_bias_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; ++ const auto forget_gate_bias_index{ ++ 
node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; ++ const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; ++ const auto output_gate_bias_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; ++ const auto projection_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional ++ const auto projection_bias_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional ++ const auto output_state_in_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; ++ const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; ++ const auto cell_threshold = node.param().cell_threshold; ++ const auto projection_threshold = node.param().projection_threshold; ++ ++ bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 && ++ operands.at(input_to_input_weights_index).shape().dim(1) != 0; ++ bool has_recurrent_to_input_weights = ++ operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && ++ operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; ++ bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0; ++ bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0; ++ bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 && ++ operands.at(projection_weights_index).shape().dim(1) != 0; ++ bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0); ++ ++ // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. ++ // true: no CIFG ++ // false: CIFG ++ // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). ++ bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; ++ ++ // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. ++ // But the cell_to_input_weights does not exist in regular CIFG although peephole. ++ // true: peephole ++ // false: no peephole ++ bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; ++ ++ // NOTE Although the projection weights has data the projection bias may not have data. 
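The NOTE comments around this point spell out how the optional LSTM operands select between the CIFG, peephole and projection variants, and the has_* flags implement that selection. Below is a compilable restatement of the rule as a sketch only; the struct and field names are illustrative stand-ins for the onert operand checks, not real identifiers.

#include <iostream>

// Illustrative presence flags for the optional LSTM operands; these names do not
// exist in onert, they only mirror the shape checks in the surrounding code.
struct OptionalLstmOperands
{
  bool input_to_input_weights = true;      // zero-sized in a CIFG model
  bool recurrent_to_input_weights = true;  // zero-sized in a CIFG model
  bool cell_to_forget_weights = false;     // present only with peephole connections
  bool cell_to_output_weights = false;     // present only with peephole connections
  bool projection_weights = false;         // present only with a projection layer
};

int main()
{
  OptionalLstmOperands op;
  const bool has_cifg_param = op.input_to_input_weights && op.recurrent_to_input_weights;
  const bool has_peephole_param = op.cell_to_forget_weights && op.cell_to_output_weights;
  const bool has_projection_param = op.projection_weights;  // the projection bias stays optional

  // Note the inverted naming: has_cifg_param == true means the *non*-CIFG path,
  // i.e. every gate has its own input and recurrent weights.
  std::cout << std::boolalpha << "non-CIFG: " << has_cifg_param
            << ", peephole: " << has_peephole_param
            << ", projection: " << has_projection_param << '\n';
  return 0;
}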
++ bool has_projection_param = has_projection_weights; ++ ++ const auto activation = node.param().activation; ++ const auto cell_clip = cell_threshold; ++ const auto projection_clip = projection_threshold; ++ assert(cell_clip >= 0.f && projection_clip >= 0.f); ++ ++ auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get(); ++ auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get(); ++ auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get(); ++ auto output_tensor = tensor_builder->at(output_index).get(); ++ ++ auto input_tensor = tensor_builder->at(input_index).get(); ++ ++ auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get(); ++ auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get(); ++ auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get(); ++ auto recurrent_to_forget_weights_tensor = ++ tensor_builder->at(recurrent_to_forget_weights_index).get(); ++ auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get(); ++ auto recurrent_to_output_weights_tensor = ++ tensor_builder->at(recurrent_to_output_weights_index).get(); ++ ++ auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get(); ++ auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get(); ++ auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get(); ++ auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get(); ++ auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get(); ++ ++ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); ++ ++ auto fn = std::make_unique(); ++ ++ ::arm_compute::LSTMParams lstm_params{}; ++ if (has_cifg_param) ++ { ++ auto input_to_input_weights_tensor = ++ tensor_builder->at(input_to_input_weights_index).get(); // optional ++ auto recurrent_to_input_weights_tensor = ++ tensor_builder->at(recurrent_to_input_weights_index).get(); // optional ++ auto cell_to_input_weights_handle = ++ has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle() ++ : nullptr; // optional (non-cifg && peephole) ++ auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional ++ lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), ++ recurrent_to_input_weights_tensor->handle(), ++ cell_to_input_weights_handle, input_gate_bias_tensor->handle()); ++ } ++ if (has_peephole_param) ++ { ++ auto cell_to_forget_weights_tensor = ++ tensor_builder->at(cell_to_forget_weights_index).get(); // optional ++ auto cell_to_output_weights_tensor = ++ tensor_builder->at(cell_to_output_weights_index).get(); // optional ++ lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), ++ cell_to_output_weights_tensor->handle()); ++ } ++ if (has_projection_param) ++ { ++ auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional ++ auto projection_bias_handle = has_projection_bias ++ ? 
tensor_builder->at(projection_bias_index).get()->handle() ++ : nullptr; // optional ++ lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); ++ } ++ ++ fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(), ++ input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(), ++ recurrent_to_forget_weights_tensor->handle(), ++ recurrent_to_cell_weights_tensor->handle(), ++ recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(), ++ cell_bias_tensor->handle(), output_gate_bias_tensor->handle(), ++ output_state_in_tensor->handle(), cell_state_in_tensor->handle(), ++ scratch_buffer_tensor->handle(), output_state_out_tensor->handle(), ++ cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, ++ cell_clip, projection_clip); ++ ++ return std::make_unique(std::move(fn)); ++} ++ ++template ++std::unique_ptr ++kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands, ++ const std::shared_ptr &tensor_builder, ir::Layout layout) ++{ ++ using ir::operation::FullyConnected; ++ ++ const auto output_index{node.getOutputs().at(0)}; ++ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; ++ const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; ++ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; ++ ++ const auto input_rank = operands.at(input_index).shape().rank(); ++ ++ const auto output_size = ++ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1); ++ UNUSED_RELEASE(output_size); ++ assert(operands.at(bias_index).shape().dim(0) == output_size); ++ assert(operands.at(weight_index).shape().dim(0) == output_size); ++ const auto batch_size = ++ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2); ++ const auto input_size = ++ operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1); ++ ++ // Check for reshaping input's shape into rank-2 ++ bool needs_reshape = false; ++ ir::Shape reshape(2); ++ if (input_rank == 3 || input_rank == 4) ++ { ++ const auto &ifm_shape = operands.at(input_index).shape(); ++ auto feature_size = 1; ++ for (int i = 0; i < ifm_shape.rank(); ++i) ++ { ++ feature_size *= ifm_shape.dim(i); ++ } ++ ++ UNUSED_RELEASE(feature_size); ++ assert(feature_size == batch_size * input_size); ++ ++ // for reshaping ++ needs_reshape = true; ++ reshape.dim(0) = batch_size; /* H */ ++ reshape.dim(1) = input_size; /* W */ ++ } ++ ++ auto output_tensor = tensor_builder->at(output_index).get(); ++ const auto input_tensor = tensor_builder->at(input_index).get(); ++ const auto weight_tensor = tensor_builder->at(weight_index).get(); ++ const auto bias_tensor = tensor_builder->at(bias_index).get(); ++ const auto frontend_layout = layout; ++ const auto acl_layout = output_tensor->handle()->info()->data_layout(); ++ ++ auto fn = ++ std::make_unique(tensor_builder->acl_tensor_manager()->internal_buffer_manager()); ++ ++ typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL; ++ if (operands.at(weight_index).isConstant()) ++ { ++ kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS; ++ assert(operands.at(weight_index).data()); ++ } ++ ++ fn->configure( ++ input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(), ++ output_tensor->handle(), needs_reshape, ++ ::onert::backend::acl_common::asTensorShape( ++ reshape, frontend_layout, 
::onert::backend::acl_common::asRuntimeLayout(acl_layout)), ++ kernel_type); ++ ++ return std::make_unique(std::move(fn)); ++} ++ ++} // namespace acl_common ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ +diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc +index e471867..37ec993 100644 +--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc ++++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc +@@ -31,6 +31,7 @@ + #include "exec/NopFunction.h" + #include "util/logging.h" + #include "util/Utils.h" ++#include "AclKernelGen.h" + + namespace onert + { +@@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto frontend_layout = _current_op_seq_layout; +- auto backend_layout = ifm_alloc->layout(); ++ auto backend_layout = ifm_tensor->layout(); + + int axis_value = node.param().axis; + if (axis_value < 0) +@@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>(); + +- fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(), ++ fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), + arm_compute::ReductionOperation::ARG_IDX_MAX); + + auto acl_fn = asAclFunction(std::move(fn)); +@@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + + assert(_ctx.at(block_size_index).data()); + + auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>(); + +- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node) + const auto 
ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::NECast>(); ++ std::unique_ptr<::arm_compute::IFunction> fn; ++ if (ifm_tensor->data_type() == ofm_tensor->data_type()) ++ { ++ auto l = std::make_unique<::arm_compute::NECopy>(); ++ ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); ++ ++ fn = std::move(l); ++ } ++ else ++ { ++ auto l = std::make_unique<::arm_compute::NECast>(); + +- auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 +- ? arm_compute::SubDataType::BOOL +- : arm_compute::SubDataType::NONE; +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); ++ ++ fn = std::move(l); ++ } + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + ker_width, ker_height); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), +- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), ++ ::arm_compute::Size2D(1U, 1U), act_info); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) + auto block_size = node.param().block_size; + assert(block_size > 0); + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + +- auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>(); ++ auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + const auto multiplier = node.param().multiplier; + const auto activation = 
node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + { + auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>(); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), +- ofm_alloc->handle(), conv_info, multiplier, act_info); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, multiplier, act_info); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -305,19 +318,19 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, +- ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride)}; ++ ::arm_compute::PoolingLayerInfo info{ ++ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::AvgPool2D &node) +@@ -348,19 +361,20 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto 
ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), ++ true /* exclude_padding */}; + + auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Concat &node) +@@ -383,7 +397,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + return; + } + +- auto output_alloc = _tensor_builder->at(ofm_index).get(); ++ auto output_tensor = _tensor_builder->at(ofm_index).get(); + std::vector<::arm_compute::ITensor *> input_tensors; + for (const auto &ifm_ind : input_indexes) + input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); +@@ -392,7 +406,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + if (input_indexes.size() < 2) + { + auto l = std::make_unique<::arm_compute::NECopy>(); +- l->configure(input_tensors.at(0), output_alloc->handle()); ++ l->configure(input_tensors.at(0), output_tensor->handle()); + fn = std::move(l); + } + else +@@ -400,10 +414,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + auto l = std::make_unique<::arm_compute::NEConcatenateLayer>(); + const auto rank = _ctx.at(ofm_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + const auto fixed_axis = + acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); +- l->configure(input_tensors, output_alloc->handle(), fixed_axis); ++ l->configure(input_tensors, output_tensor->handle(), fixed_axis); + fn = std::move(l); + } + +@@ -418,13 +432,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>(); + +- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); ++ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -436,12 +450,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto 
ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NEFloor>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -450,76 +464,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node) + + void KernelGenerator::visit(const ir::operation::FullyConnected &node) + { +- using ir::operation::FullyConnected; +- + const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; +- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; +- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; +- +- const auto input_rank = _ctx.at(input_index).shape().rank(); +- +- const auto output_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); +- UNUSED_RELEASE(output_size); +- assert(_ctx.at(bias_index).shape().dim(0) == output_size); +- assert(_ctx.at(weight_index).shape().dim(0) == output_size); +- const auto batch_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); +- const auto input_size = +- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); +- +- // Check for reshaping input's shape into rank-2 +- bool needs_reshape = false; +- ir::Shape reshape(2); +- if (input_rank == 3 || input_rank == 4) +- { +- const auto &ifm_shape = _ctx.at(input_index).shape(); +- auto feature_size = 1; +- for (int i = 0; i < ifm_shape.rank(); ++i) +- { +- feature_size *= ifm_shape.dim(i); +- } +- +- UNUSED_RELEASE(feature_size); +- assert(feature_size == batch_size * input_size); +- +- // for reshaping +- needs_reshape = true; +- reshape.dim(0) = batch_size; /* H */ +- reshape.dim(1) = input_size; /* W */ +- } +- ++ auto output_tensor = _tensor_builder->at(output_index).get(); + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- const auto input_alloc = _tensor_builder->at(input_index).get(); +- const auto weight_alloc = _tensor_builder->at(weight_index).get(); +- const auto bias_alloc = _tensor_builder->at(bias_index).get(); +- const auto frontend_layout = _current_op_seq_layout; +- const auto acl_layout = output_alloc->handle()->info()->data_layout(); +- +- auto fn = std::make_unique( +- _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- +- arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type = +- arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL; +- if (_ctx.at(weight_index).isConstant()) +- { +- kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; +- assert(_ctx.at(weight_index).data()); +- } +- +- fn->configure( +- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), +- needs_reshape, +- ::onert::backend::acl_common::asTensorShape( +- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), +- kernel_type); +- ++ auto fn = acl_common::kernelGenFullyConnected( ++ node, _ctx, _tensor_builder, _current_op_seq_layout); + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), +- ActivationBuilder::generate(activation, output_alloc->handle())); ++ std::move(fn), ActivationBuilder::generate(activation, 
output_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::HashtableLookup &node) +@@ -531,17 +484,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hits_alloc = _tensor_builder->at(hits_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hits_tensor = _tensor_builder->at(hits_index).get(); + +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto keys_alloc = _tensor_builder->at(keys_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto keys_tensor = _tensor_builder->at(keys_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::NEHashtableLookup>(); + +- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), +- output_alloc->handle(), hits_alloc->handle()); ++ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), ++ output_tensor->handle(), hits_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -561,10 +514,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // Converting in reverse order + const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto indices_alloc = _tensor_builder->at(indices_index).get(); +- const auto backend_layout = ofm_alloc->layout(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto indices_tensor = _tensor_builder->at(indices_index).get(); ++ const auto backend_layout = ofm_tensor->layout(); + UNUSED_RELEASE(backend_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. +@@ -575,35 +528,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
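The Gather code above converts the frontend axis through acl_common::ToARMComputeAxis, which, as its "Converting in reverse order" comment says, maps the axis onto ARM Compute's reversed dimension numbering. The stand-alone sketch below illustrates only that reversal, leaving out the NHWC/NCHW permutation the real helper also handles; to_acl_axis is an illustrative name, not an onert function.

#include <cassert>

// Map a frontend axis of a rank-`rank` tensor onto ARM Compute's reversed dimension
// numbering; negative axes are normalized first, just as the visitors above do.
inline int to_acl_axis(int rank, int axis)
{
  if (axis < 0)
    axis += rank;
  assert(axis >= 0 && axis < rank);
  return rank - axis - 1;
}

int main()
{
  assert(to_acl_axis(4, 3) == 0);   // channels of a rank-4 NHWC tensor -> ACL axis 0
  assert(to_acl_axis(4, -1) == 0);  // same axis given in negative form
  assert(to_acl_axis(2, 0) == 1);
  return 0;
}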
+- assert(backend_layout == ifm_alloc->layout()); +- assert(backend_layout == indices_alloc->layout()); ++ assert(backend_layout == ifm_tensor->layout()); ++ assert(backend_layout == indices_tensor->layout()); + assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); + + auto fn = std::make_unique<::arm_compute::NEGatherEx>(); + + // input is n-D, indices k-D, output is (n + k - 1)-D + size_t n = ifm_rank; +- assert(n == ifm_alloc->num_dimensions()); ++ assert(n == ifm_tensor->num_dimensions()); + size_t k = _ctx.at(indices_index).shape().rank(); +- assert(k == indices_alloc->num_dimensions()); ++ assert(k == indices_tensor->num_dimensions()); + + // Disable applied dim_correction +- if (n != ifm_alloc->info()->num_dimensions()) ++ if (n != ifm_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction + const auto ifm = _ctx.at(ifm_index); +- ifm_alloc->info()->set_tensor_shape( ++ ifm_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + } +- if (k != indices_alloc->info()->num_dimensions()) ++ if (k != indices_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and indices tensor is applied dim_correction + const auto indices = _ctx.at(indices_index); +- indices_alloc->info()->set_tensor_shape( ++ indices_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + } + +- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); ++ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); + + // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would + // use arm_compute::TensorInfo::offset_element_in_bytes() +@@ -621,20 +574,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto gamma_alloc = _tensor_builder->at(gamma_index).get(); +- auto beta_alloc = _tensor_builder->at(beta_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto gamma_tensor = _tensor_builder->at(gamma_index).get(); ++ auto beta_tensor = _tensor_builder->at(beta_index).get(); + auto epsilon = node.param().epsilon; + auto activation = node.param().activation; + + auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), +- beta_alloc->handle(), epsilon); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), ++ beta_tensor->handle(), epsilon); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::L2Normalization &node) +@@ -656,15 +609,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't 
offset the reduction. + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -686,19 +639,20 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), + ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) +@@ -712,15 +666,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod + auto beta = node.param().beta; + auto bias = node.param().bias; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -733,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::NELogicalAnd>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), 
output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -751,12 +705,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEBitwiseNot>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -769,13 +723,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::NELogicalOr>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -787,8 +741,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; +@@ -798,7 +752,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + // instead of 'INF', and then the result of this op will be errors due to the 'NaN'. + auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -807,159 +761,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + + void KernelGenerator::visit(const ir::operation::LSTM &node) + { +- // TODO Support dynamic rnn +- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
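The LSTM visitor body that starts being removed here is replaced, at the end of this hunk, by a single call into the shared acl_common::kernelGenLSTM template that this patch adds in AclKernelGen.h. The sketch below is a minimal, self-contained illustration of that shared-template-plus-thin-visitor pattern; every type in it is a placeholder rather than a real onert or ARM Compute class.

#include <memory>
#include <utility>

struct IFunction  // stands in for onert::exec::IFunction
{
  virtual ~IFunction() = default;
};

// Backend-neutral helper in the spirit of acl_common::kernelGenLSTM: build the
// backend-specific ACL layer, configure it from the node, and wrap it.
template <typename T_FunctionWrapper, typename T_Layer, typename T_Node>
std::unique_ptr<IFunction> kernelGenSketch(const T_Node &node)
{
  auto layer = std::make_unique<T_Layer>();
  layer->configure(node);  // placeholder for the long configure(...) call seen above
  return std::make_unique<T_FunctionWrapper>(std::move(layer));
}

// Minimal stand-ins for one backend's types.
struct FakeLstmNode {};
struct FakeNeonLstmLayer
{
  void configure(const FakeLstmNode &) {}
};
struct FakeAclFunction : IFunction
{
  explicit FakeAclFunction(std::unique_ptr<FakeNeonLstmLayer>) {}
};

int main()
{
  FakeLstmNode node;
  // With the shared helper in place, the visitor body reduces to a one-line delegation.
  std::unique_ptr<IFunction> return_fn = kernelGenSketch<FakeAclFunction, FakeNeonLstmLayer>(node);
  return return_fn ? 0 : 1;
}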
+- const auto scratch_buffer_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; +- const auto output_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; +- const auto cell_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; +- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; +- +- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; +- const auto input_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional +- const auto input_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; +- const auto input_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; +- const auto input_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; +- const auto recurrent_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional +- const auto recurrent_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; +- const auto recurrent_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; +- const auto recurrent_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; +- const auto cell_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional +- const auto cell_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional +- const auto cell_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional +- const auto input_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; +- const auto forget_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; +- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; +- const auto output_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; +- const auto projection_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional +- const auto projection_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional +- const auto output_state_in_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; +- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; +- const auto cell_threshold = node.param().cell_threshold; +- const auto projection_threshold = node.param().projection_threshold; +- +- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && +- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; +- bool has_recurrent_to_input_weights = +- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && +- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; +- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; +- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; +- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && +- 
_ctx.at(projection_weights_index).shape().dim(1) != 0; +- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); +- +- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. +- // true: no CIFG +- // false: CIFG +- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). +- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; +- +- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. +- // But the cell_to_input_weights does not exist in regular CIFG although peephole. +- // true: peephole +- // false: no peephole +- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; +- +- // NOTE Although the projection weights has data the projection bias may not have data. +- bool has_projection_param = has_projection_weights; +- +- const auto activation = node.param().activation; +- const auto cell_clip = cell_threshold; +- const auto projection_clip = projection_threshold; +- assert(cell_clip >= 0.f && projection_clip >= 0.f); +- +- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); +- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); +- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); +- auto output_alloc = _tensor_builder->at(output_index).get(); +- +- auto input_alloc = _tensor_builder->at(input_index).get(); +- +- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); +- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); +- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); +- auto recurrent_to_forget_weights_alloc = +- _tensor_builder->at(recurrent_to_forget_weights_index).get(); +- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); +- auto recurrent_to_output_weights_alloc = +- _tensor_builder->at(recurrent_to_output_weights_index).get(); +- +- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); +- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); +- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); +- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); +- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); +- +- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); +- +- auto fn = std::make_unique<::arm_compute::NELSTMLayer>(); +- +- ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{}; +- if (has_cifg_param) +- { +- auto input_to_input_weights_alloc = +- _tensor_builder->at(input_to_input_weights_index).get(); // optional +- auto recurrent_to_input_weights_alloc = +- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional +- auto cell_to_input_weights_handle = +- has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() +- : nullptr; // optional (non-cifg && peephole) +- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional +- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), +- recurrent_to_input_weights_alloc->handle(), +- cell_to_input_weights_handle, input_gate_bias_alloc->handle()); +- } +- if (has_peephole_param) +- { +- auto cell_to_forget_weights_alloc = +- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional +- auto cell_to_output_weights_alloc = +- _tensor_builder->at(cell_to_output_weights_index).get(); // optional +- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), +- cell_to_output_weights_alloc->handle()); +- } +- if (has_projection_param) +- { +- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional +- auto projection_bias_handle = has_projection_bias +- ? _tensor_builder->at(projection_bias_index).get()->handle() +- : nullptr; // optional +- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); +- } +- +- fn->configure( +- input_alloc->handle(), input_to_forget_weights_alloc->handle(), +- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), +- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), +- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), +- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), +- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), +- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), +- lstm_params, act_info, cell_clip, projection_clip); +- +- auto acl_fn = asAclFunction(std::move(fn)); +- +- _return_fn = std::move(acl_fn); ++ _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); + } + + void KernelGenerator::visit(const ir::operation::Mul &node) +@@ -970,18 +773,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>(); + + // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Neg &node) +@@ -989,12 +792,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NENegLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1030,12 +833,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + for (const auto &input_index : input_indexes) + { + size_t input_rank = _ctx.at(input_index).shape().rank(); +- const auto &input_alloc = _tensor_builder->at(input_index); +- assert(input_rank == input_alloc->num_dimensions()); +- if (input_rank != input_alloc->info()->num_dimensions()) ++ const auto &input_tensor = _tensor_builder->at(input_index); ++ assert(input_rank == input_tensor->num_dimensions()); ++ if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1094,8 +897,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(0)}; + const auto permute_type = node.getPermuteType(); +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto rank = _ctx.at(ofm_idx).shape().rank(); + assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); + +@@ -1108,7 +911,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::NEPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1119,7 +922,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::NEPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1127,7 +930,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + { + auto l = std::make_unique<::arm_compute::NECopy>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } +@@ -1143,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto alpha_alloc = _tensor_builder->at(alpha_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto alpha_tensor = _tensor_builder->at(alpha_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + +- auto l = std::make_unique<::arm_compute::NEPReLU>(); ++ auto l = std::make_unique<::arm_compute::NEPReluLayer>(); + +- 
l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + +@@ -1166,14 +969,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; + const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + const auto &axes = _ctx.at(axes_index); + const auto input_rank = _ctx.at(input_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = input_alloc->layout(); ++ const auto backend_layout = input_tensor->layout(); + const auto reduce_axes = + acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); + const auto reduce_type = node.param().reduce_type; +@@ -1182,11 +985,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + std::unique_ptr<::arm_compute::IFunction> fn; + if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) + { +- // NOTE NEReduceMean has a bug that does not support NHWC layout +- // NEReduceMean intermediate tensors are always NCHW layout +- auto l = std::make_unique<::arm_compute::NEReduceMeanEx>(); ++ auto l = std::make_unique<::arm_compute::NEReduceMean>(); + +- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); ++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); + + fn = std::move(l); + } +@@ -1194,7 +995,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + { + auto l = std::make_unique<::arm_compute::NEReduceSum>(); + +- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); ++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); + + fn = std::move(l); + } +@@ -1202,7 +1003,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + { + auto l = std::make_unique<::arm_compute::NEReduceOperation>(); + +- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(), ++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(), + acl_common::convertReduceType(reduce_type)); + + fn = std::move(l); +@@ -1218,15 +1019,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1238,15 +1039,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 
&node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1258,15 +1059,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1278,13 +1079,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not be changed the layout from frontend to backend + // So, PermutationOperationPass makes layouts of frontend and backend the same. 
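For reference, the ReLU1 and ReLU6 visitors above both map onto the stock arm_compute::NEActivationLayer and differ only in the ActivationLayerInfo they pass (LU_BOUNDED_RELU with bounds 1/-1 versus BOUNDED_RELU with bound 6). A minimal standalone sketch of that ACL call pattern, assuming the library's usual Tensor/allocator workflow; the shape and the main() harness are illustrative, not taken from this backend:

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
  using namespace arm_compute;

  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));

  // ReLU6 is expressed as BOUNDED_RELU with an upper bound of 6.0f, as in the visitor.
  NEActivationLayer relu6;
  relu6.configure(&input, &output,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill `input` with data ...
  relu6.run();
  return 0;
}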
+ const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || + frontend_layout == backend_layout); + UNUSED_RELEASE(frontend_layout); +@@ -1292,7 +1093,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + + auto fn = std::make_unique(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1305,12 +1106,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) + + const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NEScale>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + +@@ -1334,25 +1135,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) + + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); + +- auto input_alloc = _tensor_builder->at(input_index).get(); +- auto weights_alloc = _tensor_builder->at(weights_index).get(); +- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); +- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); ++ auto weights_tensor = _tensor_builder->at(weights_index).get(); ++ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); ++ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = std::make_unique<::arm_compute::NECopy>(); +- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); ++ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); + _return_fn = asAclFunction(std::move(copy_layer)); + +- auto fn = std::make_unique<::arm_compute::NERNNLayerEx>( ++ auto fn = std::make_unique<::arm_compute::NERNNLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), +- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), +- act_info); ++ fn->configure(input_tensor->handle(), weights_tensor->handle(), ++ recurrent_weights_tensor->handle(), bias_tensor->handle(), ++ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); + 
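For reference, the NEScale call in the ResizeBilinear hunk above uses the overload that takes the interpolation policy, border mode, fill value and sampling policy directly; newer ACL releases wrap these in a ScaleKernelInfo, so the exact overload depends on the library version being targeted. A standalone sketch mirroring the same call pattern, with illustrative shapes and assuming that overload is available:

#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
  using namespace arm_compute;

  Tensor src, dst;
  src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
  dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 1U), 1, DataType::F32));

  NEScale scale;
  // Same argument pattern as the ResizeBilinear visitor above.
  scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE,
                  PixelValue(0.f), SamplingPolicy::TOP_LEFT);

  src.allocator()->allocate();
  dst.allocator()->allocate();
  scale.run();
  return 0;
}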
_return_fn = asAclFunction(std::move(fn)); + } + +@@ -1361,12 +1162,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NERsqrtLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -1383,10 +1184,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) + (void)dims; + (void)ndim; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + auto fn = std::make_unique(); +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + auto acl_fn = asAclFunction(std::move(fn)); + _return_fn = std::move(acl_fn); + } +@@ -1396,15 +1197,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1417,13 +1218,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) + const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; + const auto beta = node.param().beta; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); ++ const auto frontend_layout = _current_op_seq_layout; ++ const auto backend_layout = input_tensor->layout(); ++ ++ // Disable applied dim_correction ++ const size_t input_rank = _ctx.at(input_index).shape().rank(); ++ if (input_rank != input_tensor->info()->num_dimensions()) ++ { ++ // This means that high dimension's value is 1 and input tensor is applied dim_correction ++ const auto input = _ctx.at(input_index); ++ input_tensor->info()->set_tensor_shape( ++ acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); ++ } + + auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), beta); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), beta); + + auto acl_fn = 
asAclFunction(std::move(fn)); + +@@ -1438,20 +1251,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); +- auto paddings_alloc = _tensor_builder->at(paddings_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); ++ auto paddings_tensor = _tensor_builder->at(paddings_index).get(); + + assert(_ctx.at(block_size_index).data()); + assert(_ctx.at(paddings_index).data()); + +- // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is +- // not 0. +- auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>(); ++ auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>(); + +- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), +- ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), ++ ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1465,12 +1276,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) + + auto block_size = node.param().block_size; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>(); ++ auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1489,13 +1300,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- std::vector output_allocs; ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ std::vector output_tensors; + for (const auto &ofm_ind : output_indexes) +- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); ++ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + auto axis = node.param().axis; + if (axis < 0) + axis += ifm_rank; +@@ -1503,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) + + auto fn = std::make_unique<::arm_compute::NESplit>(); + +- fn->configure(ifm_alloc->handle(), output_allocs, axis); ++ fn->configure(ifm_tensor->handle(), output_tensors, axis); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -1513,15 +1324,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); 
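For reference, the SpaceToDepth hunk above replaces the project-local NESpaceToDepthLayerEx with the stock arm_compute::NESpaceToDepthLayer. A minimal standalone sketch of that function with an arbitrary 4x4 single-channel input and block size 2, assuming the usual ACL Tensor/allocator workflow:

#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
  using namespace arm_compute;

  Tensor src, dst;
  // TensorShape is (width, height, channels): a 4x4x1 input becomes 2x2x4 for block size 2.
  src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
  dst.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U), 1, DataType::F32));

  NESpaceToDepthLayer space_to_depth;
  space_to_depth.configure(&src, &dst, 2 /* block size, cf. node.param().block_size */);

  src.allocator()->allocate();
  dst.allocator()->allocate();
  space_to_depth.run();
  return 0;
}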
+- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1534,13 +1345,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1555,17 +1366,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Slice &node) +@@ -1575,10 +1386,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -1628,7 +1439,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + + auto fn = std::make_unique<::arm_compute::NESlice>(); + +- 
fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1643,10 +1454,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -1715,7 +1526,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + + auto fn = std::make_unique<::arm_compute::NEStridedSlice>(); + +- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, + strides_set, begin_mask, end_mask, shrink_axis_mask); + + auto acl_fn = asAclFunction(std::move(fn)); +@@ -1749,16 +1560,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>(); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, +- invalid_horizontal, invalid_vertical); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), ++ tconv_info, invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1771,10 +1582,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto &perm{node.param().perm}; + +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- const auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ const auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + std::vector pv(perm.cbegin(), perm.cend()); +@@ -1783,11 +1594,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + + std::unique_ptr<::arm_compute::IFunction> fn; + +- if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2) ++ if (ifm_tensor->num_dimensions() <= 2 && 
ofm_tensor->num_dimensions() <= 2) + { + auto l = std::make_unique<::arm_compute::NETranspose>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } +@@ -1795,7 +1606,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + { + auto l = std::make_unique<::arm_compute::NEPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); + + fn = std::move(l); + } +@@ -1834,13 +1645,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + for (const auto &output_index : output_indexes) + { + size_t output_rank = _ctx.at(output_index).shape().rank(); +- const auto &output_alloc = _tensor_builder->at(output_index); +- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); +- assert(output_rank == output_alloc->num_dimensions()); +- if (output_rank != output_alloc->info()->num_dimensions()) ++ const auto &output_tensor = _tensor_builder->at(output_index); ++ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); ++ assert(output_rank == output_tensor->num_dimensions()); ++ if (output_rank != output_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1858,17 +1669,17 @@ void KernelGenerator::visit(const ir::operation::Add &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Div &node) +@@ -1879,16 +1690,16 @@ void KernelGenerator::visit(const ir::operation::Div &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, 
ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Exp &node) +@@ -1896,12 +1707,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEExpLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1913,12 +1724,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEReshapeLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1933,13 +1744,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) + + const auto comparison_type = node.param().comparison_type; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + auto acl_fn = asAclFunction(std::move(fn)); +@@ -1953,13 +1764,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseMin>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1972,13 +1783,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) + const auto 
lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseMax>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc +index 71e3136..deb27f0 100644 +--- a/runtime/onert/backend/cpu/ConstantInitializer.cc ++++ b/runtime/onert/backend/cpu/ConstantInitializer.cc +@@ -15,6 +15,7 @@ + */ + + #include "ConstantInitializer.h" ++#include "Tensor.h" + + namespace onert + { +@@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands, + // DO NOTHING + } + ++void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index, ++ const ir::Operand &obj) ++{ ++ registerExternalInitializer(index, obj); ++} ++ ++void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index, ++ const ir::Operand &obj) ++{ ++ // For only CONSTANTS ++ // TODO Add to check if tensor has been allocated ++ if (!obj.isConstant()) ++ return; ++ ++ _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) { ++ auto data = model_obj.shareData(); ++ assert(data && data->base()); ++ ExternalTensor &tensor = dynamic_cast(itensor); ++ tensor.setData(data); ++ }; ++} ++ + void ConstantInitializer::visit(const ir::operation::Conv2D &node) + { + const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); +- registerCopyInitializer(kernel_index, kernel_obj); ++ registerExternalInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); +- registerCopyInitializer(bias_index, bias_obj); ++ registerExternalInitializer(bias_index, bias_obj); + } + + void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) + { + const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); +- registerCopyInitializer(kernel_index, kernel_obj); ++ registerExternalInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); +- registerCopyInitializer(bias_index, bias_obj); ++ registerExternalInitializer(bias_index, bias_obj); + } + + void ConstantInitializer::visit(const ir::operation::FullyConnected &node) + { + const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); + const auto &weight_obj = _operands.at(weight_index); +- registerCopyInitializer(weight_index, weight_obj); ++ registerExternalInitializer(weight_index, weight_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); + if (!bias_index.undefined()) + { + const auto &bias_obj = 
_operands.at(bias_index); +- registerCopyInitializer(bias_index, bias_obj); ++ registerExternalInitializer(bias_index, bias_obj); + } + } + +diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h +index bd06c64..de03a69 100644 +--- a/runtime/onert/backend/cpu/ConstantInitializer.h ++++ b/runtime/onert/backend/cpu/ConstantInitializer.h +@@ -36,6 +36,15 @@ public: + const std::shared_ptr &tensor_builder); + + public: ++ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; ++ ++ // TODO: For now the only cpu backend supports constant tensor to use data from external ++ // If the other backend supports (to do this, ++ // ExternalTensor should be abstract such as IExternal, maybe), ++ // this can be an interface of IConstantInitializer ++ void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); ++ ++public: + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::FullyConnected &) override; +diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc +index 72f9606..2766aa2 100644 +--- a/runtime/onert/backend/cpu/KernelGenerator.cc ++++ b/runtime/onert/backend/cpu/KernelGenerator.cc +@@ -60,6 +60,7 @@ + #include "ops/SoftMaxLayer.h" + #include "ops/StridedSliceLayer.h" + #include "ops/SpaceToBatchNDLayer.h" ++#include "ops/SpaceToDepthLayer.h" + #include "ops/SplitLayer.h" + #include "ops/SubLayer.h" + #include "ops/TanhLayer.h" +@@ -70,11 +71,13 @@ + #include "ops/ZerosLikeLayer.h" + #include "ops/SquaredDiffLayer.h" + #include "ops/LogicalOrLayer.h" ++#include "ops/L2NormLayer.h" + #include "ops/MatrixBandPartLayer.h" + #include "ops/BatchMatMulLayer.h" + #include "ops/BroadcastToLayer.h" + #include "ops/FusedBatchNormLayer.h" + #include "ops/LogSoftMaxLayer.h" ++#include "ops/QuantizeLayer.h" + + #include + #include +@@ -184,10 +187,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); +- auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); +- auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); ++ auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); + + const auto stride = node.param().stride; + const auto activation = node.param().activation; +@@ -196,9 +199,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + + if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic()) + { +- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left, ++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left, + param_padding.param.right, param_padding.param.top, param_padding.param.bottom, +- stride.horizontal, stride.vertical, activation, ofm_alloc); ++ stride.horizontal, stride.vertical, activation, ofm_tensor); + + _return_fn = std::move(fn); + return; +@@ -213,9 +216,9 @@ void KernelGenerator::visit(const 
ir::operation::Conv2D &node) + const auto padding = + ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height); + +- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right, +- padding.top, padding.bottom, stride.horizontal, stride.vertical, activation, +- ofm_alloc); ++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, ++ padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, ++ activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -241,16 +244,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); +- auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); +- auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); ++ auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top, ++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top, + padding.bottom, stride.horizontal, stride.vertical, multiplier, activation, +- ofm_alloc); ++ ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -270,13 +273,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, +- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); ++ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, ++ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -295,13 +298,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, +- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); ++ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, ++ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -313,7 +316,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + const auto rank = 
_ctx.at(ofm_index).shape().rank(); + const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); + +- auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + + std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +@@ -321,7 +324,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + + auto fn = std::make_unique(); + +- fn->configure(input_tensors, axis, output_alloc); ++ fn->configure(input_tensors, axis, output_tensor); + + _return_fn = std::move(fn); + } +@@ -332,13 +335,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node) + const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; + const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto value_alloc = _tensor_builder->portableAt(value_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto value_tensor = _tensor_builder->portableAt(value_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, value_alloc, output_alloc); ++ fn->configure(input_tensor, value_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -353,15 +356,15 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto weight_alloc = _tensor_builder->portableAt(weight_index).get(); +- auto bias_alloc = ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto weight_tensor = _tensor_builder->portableAt(weight_index).get(); ++ auto bias_tensor = + bias_index.undefined() ? 
nullptr : _tensor_builder->portableAt(bias_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc); ++ fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor); + + _return_fn = std::move(fn); + } +@@ -371,21 +374,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + // optional 2nd input +- IPortableTensor *shape_alloc = nullptr; ++ IPortableTensor *shape_tensor = nullptr; + + if (node.getInputs().size() == 2) + { + const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; +- shape_alloc = _tensor_builder->portableAt(shape_index).get(); ++ shape_tensor = _tensor_builder->portableAt(shape_index).get(); + } + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, shape_alloc, output_alloc); ++ fn->configure(input_tensor, shape_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -394,13 +397,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + // Squeeze can share same kernel with reshape + auto fn = std::make_unique(); + +- fn->configure(input_alloc, nullptr, output_alloc); ++ fn->configure(input_tensor, nullptr, output_tensor); + + _return_fn = std::move(fn); + } +@@ -412,12 +415,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) + + const auto beta = node.param().beta; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, beta, output_alloc); ++ fn->configure(input_tensor, beta, output_tensor); + + _return_fn = std::move(fn); + } +@@ -430,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Add &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -447,15 +450,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) + const auto 
lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; + const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto comparison_type = node.param().comparison_type; + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -466,11 +469,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); + +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + UNUSED_RELEASE(backend_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. +@@ -481,8 +484,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
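The layout restriction described in the Gather comment above is easier to see with the concrete NHWC-to-NCHW axis mapping. A small self-contained illustration follows; the permutation table is just the standard one and is not code from this backend:

#include <array>
#include <cstdio>

int main()
{
  // Position of each NHWC axis (N, H, W, C) inside an NCHW-laid-out rank-4 tensor (N, C, H, W).
  constexpr std::array<int, 4> nhwc_to_nchw = {0, 2, 3, 1};

  // W (NHWC axis 2) and C (NHWC axis 3) are neighbours in NHWC...
  std::printf("W: NHWC axis 2 -> NCHW axis %d\n", nhwc_to_nchw[2]); // prints 3
  std::printf("C: NHWC axis 3 -> NCHW axis %d\n", nhwc_to_nchw[3]); // prints 1
  // ...but land on NCHW axes 3 and 1, which are not sequential. This is why the Gather
  // visitor asserts that the frontend and backend layouts match for rank-4 inputs.
  return 0;
}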
+- assert(backend_layout == input_alloc->layout()); +- assert(backend_layout == indices_alloc->layout()); ++ assert(backend_layout == input_tensor->layout()); ++ assert(backend_layout == indices_tensor->layout()); + const auto &input_shape = _ctx.at(input_index).shape(); + UNUSED_RELEASE(input_shape); + assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout); +@@ -492,7 +495,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, indices_alloc, output_alloc, axis_value); ++ fn->configure(input_tensor, indices_tensor, output_tensor, axis_value); + + _return_fn = std::move(fn); + } +@@ -506,13 +509,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -526,13 +529,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -547,18 +550,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) + + const auto axis = node.param().axis; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); +- auto depth_alloc = _tensor_builder->portableAt(depth_index).get(); +- auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get(); +- auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); ++ auto depth_tensor = _tensor_builder->portableAt(depth_index).get(); ++ auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get(); ++ auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get(); + +- assert(indices_alloc->data_type() == OperandType::INT32); +- assert(axis <= static_cast(indices_alloc->num_dimensions())); ++ assert(indices_tensor->data_type() == OperandType::INT32); ++ assert(axis <= static_cast(indices_tensor->num_dimensions())); + + auto fn = std::make_unique(); + +- fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis); ++ fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis); + + _return_fn = std::move(fn); + } 
+@@ -572,13 +575,13 @@ void KernelGenerator::visit(const ir::operation::Div &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -587,16 +590,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) + { + const auto ofm_index{node.getOutputs().at(0)}; + +- auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); +- std::vector input_allocs; ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); ++ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + + const auto equation = node.param().equation; + + auto fn = std::make_unique(); + +- fn->configure(input_allocs, equation, output_alloc); ++ fn->configure(input_tensors, equation, output_tensor); + + _return_fn = std::move(fn); + } +@@ -605,14 +608,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node) + { + auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, + std::vector &types, +- std::vector> &allocs) { ++ std::vector> &tensors) { + for (auto &idx : opSeq) + { + const auto &operand = _ctx.at(idx); + // TODO make sure using `_current_op_seq_layout` is correct for custom operations + types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()}); +- auto in_alloc = _tensor_builder->portableAt(idx); +- allocs.emplace_back(in_alloc); ++ auto in_tensor = _tensor_builder->portableAt(idx); ++ tensors.emplace_back(in_tensor); + } + }; + +@@ -634,12 +637,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -650,13 +653,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) + const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); + + auto fn = std::make_unique(); + +- 
fn->configure(input_alloc, axis_alloc, output_alloc); ++ fn->configure(input_tensor, axis_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -666,12 +669,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -681,12 +684,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -700,7 +703,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + + assert(-rank <= axis && axis < rank); + +- auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + + std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +@@ -708,7 +711,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + + auto fn = std::make_unique(); + +- fn->configure(input_tensors, axis, output_alloc); ++ fn->configure(input_tensors, axis, output_tensor); + + _return_fn = std::move(fn); + } +@@ -722,7 +725,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + + assert(rank == 0 || (-rank <= axis && axis < rank)); + +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + std::vector output_tensors; + for (auto &output_idx : node.getOutputs()) +@@ -732,7 +735,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + + uint32_t axis_resolved = (axis < 0 ? axis + rank : axis); + +- fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors); ++ fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors); + + _return_fn = std::move(fn); + } +@@ -751,8 +754,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node) + + auto fn = std::make_unique(); + +- fn->configure(input, output, pad_base, pad_rank); ++ bool isPadV2 = node.getInputs().size() == 3 ? 
true : false; ++ const void *value = nullptr; + ++ if (isPadV2) ++ { ++ const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)}; ++ value = reinterpret_cast(_ctx.at(value_index).data()->base()); ++ } ++ ++ fn->configure(input, output, pad_base, pad_rank, value); + _return_fn = std::move(fn); + } + +@@ -762,13 +773,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -779,13 +790,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -795,12 +806,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -810,12 +821,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc, node.param().perm); ++ fn->configure(input_tensor, output_tensor, node.param().perm); + + _return_fn = std::move(fn); + } +@@ -827,15 +838,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; + + const auto keep_dims = node.param().keep_dims; +- auto output_alloc = 
_tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto axes_alloc = _tensor_builder->portableAt(axes_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto axes_tensor = _tensor_builder->portableAt(axes_index).get(); + + if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) + { + auto fn = std::make_unique(); + +- fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims); ++ fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims); + + _return_fn = std::move(fn); + } +@@ -844,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + auto fn = std::make_unique(); + + const auto reduce_type = convertReduceType(node.param().reduce_type); +- fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims); ++ fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims); + + _return_fn = std::move(fn); + } +@@ -855,12 +866,12 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -872,14 +883,14 @@ void KernelGenerator::visit(const ir::operation::Select &node) + const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; + const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto condition_alloc = _tensor_builder->portableAt(condition_index).get(); +- auto true_alloc = _tensor_builder->portableAt(true_index).get(); +- auto false_alloc = _tensor_builder->portableAt(false_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto condition_tensor = _tensor_builder->portableAt(condition_index).get(); ++ auto true_tensor = _tensor_builder->portableAt(true_index).get(); ++ auto false_tensor = _tensor_builder->portableAt(false_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc); ++ fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -891,14 +902,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto begins_alloc = _tensor_builder->portableAt(begins_index).get(); +- auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto begins_tensor = _tensor_builder->portableAt(begins_index).get(); ++ auto 
sizes_tensor = _tensor_builder->portableAt(sizes_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc); ++ fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -911,11 +922,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto starts_alloc = _tensor_builder->portableAt(starts_index).get(); +- auto ends_alloc = _tensor_builder->portableAt(ends_index).get(); +- auto strides_alloc = _tensor_builder->portableAt(strides_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto starts_tensor = _tensor_builder->portableAt(starts_index).get(); ++ auto ends_tensor = _tensor_builder->portableAt(ends_index).get(); ++ auto strides_tensor = _tensor_builder->portableAt(strides_index).get(); + + auto begin_mask = node.param().begin_mask; + auto end_mask = node.param().end_mask; +@@ -923,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask, ++ fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask, + end_mask, shrink_axis_mask); + + _return_fn = std::move(fn); +@@ -957,12 +968,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -972,12 +983,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -987,12 +998,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- 
fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1002,12 +1013,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1017,12 +1028,12 @@ void KernelGenerator::visit(const ir::operation::Shape &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1033,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) + const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, axis_alloc, output_alloc); ++ fn->configure(input_tensor, axis_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1049,12 +1060,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1066,12 +1077,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + const auto axis = node.param().axis; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true); ++ fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); + + _return_fn = std::move(fn); + } +@@ -1082,13 +1093,13 @@ void 
KernelGenerator::visit(const ir::operation::Pow &node) + const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1098,12 +1109,12 @@ void KernelGenerator::visit(const ir::operation::Log &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1113,12 +1124,12 @@ void KernelGenerator::visit(const ir::operation::Round &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1128,12 +1139,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1144,28 +1155,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) + const auto lhs_index{node.getInputs().at(0)}; + const auto rhs_index{node.getInputs().at(1)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + + 
_return_fn = std::move(fn); + } + +-void KernelGenerator::visit(const ir::operation::ZerosLike &node) ++void KernelGenerator::visit(const ir::operation::L2Normalization &node) + { + const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; ++ const auto input_index{node.getInputs().at(0)}; + + auto output_alloc = _tensor_builder->portableAt(output_index).get(); + auto input_alloc = _tensor_builder->portableAt(input_index).get(); + +- auto fn = std::make_unique(); ++ auto fn = std::make_unique(); + + fn->configure(input_alloc, output_alloc); ++ ++ _return_fn = std::move(fn); ++} ++ ++void KernelGenerator::visit(const ir::operation::ZerosLike &node) ++{ ++ const auto output_index{node.getOutputs().at(0)}; ++ const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; ++ ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ ++ auto fn = std::make_unique(); ++ ++ fn->configure(input_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1176,14 +1202,14 @@ void KernelGenerator::visit(const ir::operation::Range &node) + const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; + const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto start_alloc = _tensor_builder->portableAt(start_index).get(); +- auto limit_alloc = _tensor_builder->portableAt(limit_index).get(); +- auto delta_alloc = _tensor_builder->portableAt(delta_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto start_tensor = _tensor_builder->portableAt(start_index).get(); ++ auto limit_tensor = _tensor_builder->portableAt(limit_index).get(); ++ auto delta_tensor = _tensor_builder->portableAt(delta_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc); ++ fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1193,13 +1219,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + _return_fn = std::move(fn); + } + +@@ -1209,13 +1235,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node) + const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; + const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get(); ++ auto output_tensor = 
_tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, multiples_alloc, output_alloc); ++ fn->configure(input_tensor, multiples_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1226,14 +1252,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) + const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; + const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get(); +- auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get(); ++ auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc); ++ fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1243,16 +1269,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) + const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + const auto adj_x = node.param().adj_x; + const auto adj_y = node.param().adj_y; + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1262,13 +1288,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) + const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; + const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto shape_alloc = _tensor_builder->portableAt(shape_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto shape_tensor = _tensor_builder->portableAt(shape_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, shape_alloc, output_alloc); ++ fn->configure(input_tensor, shape_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1277,10 +1303,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) + { + const auto ofm_index{node.getOutputs().at(0)}; + +- auto output_alloc = 
_tensor_builder->portableAt(ofm_index).get(); +- std::vector input_allocs; ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); ++ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + + const auto epsilon = node.param().epsilon; + const auto is_training = node.param().is_training; +@@ -1288,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) + + auto fn = std::make_unique(); + +- fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc); ++ fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1301,12 +1327,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) + const auto beta = node.param().beta; + const auto axis = node.param().axis; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, beta, axis, output_alloc); ++ fn->configure(input_tensor, beta, axis, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1318,14 +1344,45 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; + const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get(); +- auto padding_alloc = _tensor_builder->portableAt(padding_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get(); ++ auto padding_tensor = _tensor_builder->portableAt(padding_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc); ++ fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor); ++ ++ _return_fn = std::move(fn); ++} ++ ++void KernelGenerator::visit(const ir::operation::Quantize &node) ++{ ++ const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)}; ++ const auto output_index{node.getOutputs().at(0)}; ++ ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ ++ auto fn = std::make_unique(); ++ ++ fn->configure(input_tensor, output_tensor); ++ ++ _return_fn = std::move(fn); ++} ++ ++void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) ++{ ++ const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; ++ const auto output_index{node.getOutputs().at(0)}; ++ auto block_size = node.param().block_size; ++ ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ ++ auto fn = std::make_unique(); ++ ++ fn->configure(input_tensor, block_size, output_tensor); + + 
_return_fn = std::move(fn); + } +diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h +index d6f4c28..f564bf8 100644 +--- a/runtime/onert/backend/cpu/KernelGenerator.h ++++ b/runtime/onert/backend/cpu/KernelGenerator.h +@@ -94,6 +94,7 @@ public: + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::Tile &) override; + void visit(const ir::operation::LogicalOr &) override; ++ void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::Range &) override; + void visit(const ir::operation::MatrixBandPart &) override; + void visit(const ir::operation::BatchMatMul &) override; +@@ -101,6 +102,8 @@ public: + void visit(const ir::operation::FusedBatchNorm &) override; + void visit(const ir::operation::LogSoftmax &) override; + void visit(const ir::operation::SpaceToBatchND &) override; ++ void visit(const ir::operation::Quantize &) override; ++ void visit(const ir::operation::SpaceToDepth &) override; + + private: + const ir::Operands &_ctx; +diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc +new file mode 100644 +index 0000000..8723072 +--- /dev/null ++++ b/runtime/onert/backend/cpu/StaticTensorManager.cc +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "StaticTensorManager.h" ++#include "Tensor.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++ ++StaticTensorManager::StaticTensorManager(const std::shared_ptr ®) ++ : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg} ++{ ++ // DO NOTHING ++} ++ ++void StaticTensorManager::allocateNonconsts(void) ++{ ++ _nonconst_mgr->allocate(); ++ ++ for (auto &pair : _tensors->native_tensors()) ++ { ++ const auto &ind = pair.first; ++ auto tensor = pair.second; ++ if (!_as_constants[ind] && !tensor->is_dynamic()) ++ { ++ auto *buffer = _nonconst_mgr->getBuffer(ind); ++ tensor->setBuffer(buffer); ++ ++ VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() ++ << "): " << static_cast(buffer) << std::endl; ++ } ++ } ++} ++ ++void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } ++ ++void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, ++ const ir::OperandInfo &tensor_info, ir::Layout backend_layout, ++ bool as_const) ++{ ++ assert(!_tensors->getITensor(ind)); ++ if (as_const) ++ { ++ auto tensor = std::make_shared(tensor_info, backend_layout); ++ _tensors->setNativeTensor(ind, tensor); ++ } ++ else ++ { ++ auto tensor = std::make_shared(tensor_info, backend_layout); ++ _tensors->setNativeTensor(ind, tensor); ++ } ++ _as_constants[ind] = as_const; ++} ++ ++void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) ++{ ++ assert(_tensors->getITensor(ind)); ++ ++ // This method is called only when a tensor has proper shape ++ assert(!_tensors->getITensor(ind)->is_dynamic()); ++ ++ if (!_as_constants[ind]) ++ _nonconst_mgr->claimPlan(ind, size); ++} ++ ++void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) ++{ ++ assert(_tensors->getITensor(ind)); ++ ++ // This method is called only when a tensor has proper shape ++ assert(!_tensors->getITensor(ind)->is_dynamic()); ++ ++ if (!_as_constants[ind]) ++ _nonconst_mgr->releasePlan(ind); ++} ++ ++void StaticTensorManager::iterate(const std::function &fn) ++{ ++ for (const auto &it : _tensors->native_tensors()) ++ fn(it.first); ++} ++ ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h +new file mode 100644 +index 0000000..66243a5 +--- /dev/null ++++ b/runtime/onert/backend/cpu/StaticTensorManager.h +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ ++#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ ++ ++#include "backend/IStaticTensorManager.h" ++#include "backend/cpu_common/MemoryManager.h" ++#include "backend/cpu_common/TensorRegistry.h" ++#include "backend/ITensorManager.h" ++#include "ir/OperandIndexMap.h" ++#include "ir/OperandInfo.h" ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++ ++class StaticTensorManager : public backend::IStaticTensorManager ++{ ++public: ++ StaticTensorManager(const std::shared_ptr &reg); ++ virtual ~StaticTensorManager() = default; ++ ++ void allocateNonconsts(void); ++ void deallocateNonconsts(void); ++ ++ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ++ ir::Layout backend_layout, bool as_const); ++ ++ void claimPlan(const ir::OperandIndex &ind, uint32_t size); ++ void releasePlan(const ir::OperandIndex &ind); ++ ++ void iterate(const std::function &fn); ++ ++private: ++ std::unique_ptr _nonconst_mgr; ++ const std::shared_ptr _tensors; ++ ir::OperandIndexMap _as_constants; ++}; ++ ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ +diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h +index 4dd251b..da16d05 100644 +--- a/runtime/onert/backend/cpu/Tensor.h ++++ b/runtime/onert/backend/cpu/Tensor.h +@@ -29,8 +29,14 @@ namespace cpu + + using Tensor = cpu_common::Tensor; + +-// Tensor which has data from external. To support this, assume below things +-// no padding, always NHWC layout, constant tensor and not dynamic ++/** ++ * @brief Class that uses data from external memory that is not managed by a backend ++ * instead of allocating and copying the data. ExternalTensor's data pointer points to ++ * an address of memory that is already allocated, or to an mmapped area. ++ * This means that ExternalTensor can take any type of ir::Data. ++ * To support this, the following are assumed: no padding, always NHWC layout, ++ * constant tensor, and not dynamic.
++ */ + class ExternalTensor : public Tensor + { + public: +@@ -45,6 +51,11 @@ public: + } + + public: ++ /** ++ * @brief set Data to be shared from external so that this ExternalTensor will not be ++ * allocated on CPU backend ++ * @param[in] data data of Operand to be set ++ */ + void setData(const std::shared_ptr data) + { + assert(data != nullptr); +diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc +index 886e8d8..7eb3ce8 100644 +--- a/runtime/onert/backend/cpu/TensorBuilder.cc ++++ b/runtime/onert/backend/cpu/TensorBuilder.cc +@@ -29,7 +29,7 @@ namespace cpu + + TensorBuilder::TensorBuilder() + : _tensor_reg{new cpu_common::TensorRegistry()}, +- _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)}, ++ _static_tensor_mgr{new StaticTensorManager(_tensor_reg)}, + _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)} + { + /* empty */ +@@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const + return _tensor_info_map.find(ind) != _tensor_info_map.end(); + } + +-void TensorBuilder::prepare(void) +-{ +- _static_tensor_mgr->allocateConsts(); +- _static_tensor_mgr->allocateNonconsts(); +-} ++void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } + + void TensorBuilder::allocate() + { +@@ -99,17 +95,17 @@ std::shared_ptr TensorBuilder::portableAt(const ir::OperandInde + return _tensor_reg->getPortableTensor(ind); + } + +-bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind, +- const std::shared_ptr &tensor) ++bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind, ++ const std::shared_ptr &tensor) + { +- return _tensor_reg->setExternalTensor(ind, tensor); ++ return _tensor_reg->setMigrantTensor(ind, tensor); + } + + void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); } + +-std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) ++std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) + { +- return _tensor_reg->getManagedTensor(ind); ++ return _tensor_reg->getNativeTensor(ind); + } + + std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) +diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h +index ba25451..12ca28c 100644 +--- a/runtime/onert/backend/cpu/TensorBuilder.h ++++ b/runtime/onert/backend/cpu/TensorBuilder.h +@@ -18,13 +18,14 @@ + #define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__ + + #include +-#include + #include +-#include + + #include + #include + ++#include "StaticTensorManager.h" ++#include "Tensor.h" ++ + #include + + namespace onert +@@ -80,16 +81,16 @@ public: + * If not, program will crash with assert or exception. 
+ * @return shared_ptr + */ +- std::shared_ptr at(const ir::OperandIndex &ind); ++ std::shared_ptr at(const ir::OperandIndex &ind); + std::shared_ptr portableAt(const ir::OperandIndex &ind); +- bool setExternalTensor(const ir::OperandIndex &ind, +- const std::shared_ptr &tensor) override; ++ bool setMigrantTensor(const ir::OperandIndex &ind, ++ const std::shared_ptr &tensor) override; + + std::shared_ptr tensorRegistry() override { return _tensor_reg; } + + private: + const std::shared_ptr _tensor_reg; +- std::unique_ptr _static_tensor_mgr; ++ std::unique_ptr _static_tensor_mgr; + std::unique_ptr _dynamic_tensor_mgr; + ir::OperandIndexMap _tensor_info_map; + }; +diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc +index f557f3a..adf902a 100644 +--- a/runtime/onert/backend/cpu/ops/CompareLayer.cc ++++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc +@@ -17,6 +17,7 @@ + + #include "OperationUtils.h" + ++#include + #include + using namespace nnfw::cker; + namespace onert +@@ -34,6 +35,14 @@ namespace + using OpType = onert::ir::operation::Comparison::ComparisonType; + using namespace onert::backend::cpu; + ++// Assumes these enum values to be in the order like this ++static_assert(static_cast(OpType::Equal) == 0, "An OpType value has changed!"); ++static_assert(static_cast(OpType::NotEqual) == 1, "An OpType value has changed!"); ++static_assert(static_cast(OpType::Greater) == 2, "An OpType value has changed!"); ++static_assert(static_cast(OpType::GreaterEqual) == 3, "An OpType value has changed!"); ++static_assert(static_cast(OpType::Less) == 4, "An OpType value has changed!"); ++static_assert(static_cast(OpType::LessEqual) == 5, "An OpType value has changed!"); ++ + template + void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, + OpType op_type) +@@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort + ¶ms.input2_shift); + params.is_broadcast = !HaveSameShapes(lhs, rhs); + +- if (params.is_broadcast) +- { +- switch (op_type) +- { +- case OpType::Equal: +- Broadcast4DSlowEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- Broadcast4DSlowNotEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- Broadcast4DSlowGreaterWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- Broadcast4DSlowGreaterEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- Broadcast4DSlowLessWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::LessEqual: 
+- Broadcast4DSlowLessEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- else // if (requires_broadcast == false) +- { +- switch (op_type) +- { +- case OpType::Equal: +- EqualWithScaling(params, getExtendedTensorShape(lhs), +- reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), +- reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- NotEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- GreaterWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- GreaterEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- LessWithScaling(params, getExtendedTensorShape(lhs), +- reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), +- reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::LessEqual: +- LessEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- return; ++ using CompareFunction = ++ void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data, ++ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, ++ bool *output_data); ++ ++ static const CompareFunction broadcast_fns[] = { ++ Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling, ++ Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling, ++ Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling, ++ }; ++ static const CompareFunction non_broadcast_fns[] = { ++ EqualWithScaling, NotEqualWithScaling, GreaterWithScaling, ++ GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling, ++ }; ++ ++ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), ++ "Sizes of broadcast_fns and non_broadcast_fns must match!"); ++ ++ auto index = static_cast(op_type); ++ if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) ++ throw std::runtime_error{"Invalid OpType for CompareLayer"}; ++ ++ CompareFunction fn = (params.is_broadcast ? 
broadcast_fns[index] : non_broadcast_fns[index]); ++ ++ fn(params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), ++ getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), ++ getExtendedTensorShape(output), reinterpret_cast(output->buffer())); + } + + template +@@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort + { + bool requires_broadcast = !HaveSameShapes(lhs, rhs); + +- if (requires_broadcast) +- { +- switch (op_type) +- { +- case OpType::Equal: +- Broadcast4DSlowEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- Broadcast4DSlowNotEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- Broadcast4DSlowGreater( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- Broadcast4DSlowGreaterEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::LessEqual: +- Broadcast4DSlowLessEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- else // if (requires_broadcast == false) +- { +- switch (op_type) +- { +- case OpType::Equal: +- EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- GreaterEqualNoScaling( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case 
OpType::LessEqual: +- LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- return; ++ using CompareFunction = ++ void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, ++ const T *input2_data, const Shape &output_shape, bool *output_data); ++ ++ static const CompareFunction broadcast_fns[] = { ++ Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater, ++ Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual, ++ }; ++ static const CompareFunction non_broadcast_fns[] = { ++ EqualNoScaling, NotEqualNoScaling, GreaterNoScaling, ++ GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling, ++ }; ++ ++ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), ++ "Sizes of broadcast_fns and non_broadcast_fns must match!"); ++ ++ auto index = static_cast(op_type); ++ if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) ++ throw std::runtime_error{"Invalid OpType for CompareLayer"}; ++ ++ CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]); ++ ++ fn(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), ++ getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), ++ getExtendedTensorShape(output), reinterpret_cast(output->buffer())); + } ++ + } // namespace + + CompareLayer::CompareLayer() +diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +index c00be64..ff22e32 100644 +--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc ++++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +@@ -18,6 +18,7 @@ + + #include "../Tensor.h" + #include ++#include + + namespace onert + { +@@ -112,15 +113,32 @@ void FullyConnectedLayer::fullyConnectedHybrid() + getTensorShape(_bias), reinterpret_cast(_bias ? _bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast(_output->buffer()), temp_arena); + +-// TODO Enable calling decrease_ref +-#if 0 ++// TODO Remove this ifdef ++#ifdef EXPERIMENTAL_RUY_FEATURE + if (_cached_weights == nullptr || _is_weights_freed) + return; + ++ // '_cached_weights is not nullptr and _is_weights_freed is false' means ++ // this weight shape is satisfied with the ruy kernel's prepack cache's condition. ++ // After entering here, it will not enter again except below the case - input is zero-vector ++ ++ // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path) ++ // so that handle this case ++ const int input_size = getTensorShape(_input).FlatSize(); ++ if (nnfw::cker::IsZeroVector(reinterpret_cast(_input->buffer()), input_size)) ++ return; ++ ++ // This weight tensor could be other ops' const tensor. ++ // Therefore, below reference should be checked like following + auto weight_tensor = dynamic_cast(_weights); + if (weight_tensor) + { + auto tensor = const_cast(weight_tensor); ++ if (tensor->buffer() == nullptr) // ref is already 0? ++ { ++ _is_weights_freed = true; ++ return; ++ } + + tensor->decrease_ref(); + if (tensor->buffer() == nullptr) // ref == 0? 
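The CompareLayer.cc change above replaces the two large per-operator switch statements with arrays of kernel function pointers indexed by the comparison type, pinned by static_asserts on the enum values and guarded by a bounds check before dispatch. Below is a minimal, self-contained sketch of that table-driven dispatch pattern, assuming plain float buffers and standard-library comparators in place of the cker kernels and IPortableTensor buffers; CompareOp, elementwise, and kCompareFns are illustrative names only, not part of this patch.

#include <cstddef>
#include <functional>
#include <stdexcept>

// Stand-in for the comparison enum; the table below assumes exactly this ordering.
enum class CompareOp
{
  Equal = 0,
  NotEqual = 1,
  Greater = 2,
  GreaterEqual = 3,
  Less = 4,
  LessEqual = 5
};

static_assert(static_cast<int>(CompareOp::Equal) == 0, "CompareOp ordering changed");
static_assert(static_cast<int>(CompareOp::LessEqual) == 5, "CompareOp ordering changed");

// Every comparison kernel shares one signature, so they can live in one table.
using CompareFn = void (*)(const float *lhs, const float *rhs, bool *out, std::size_t n);

template <typename Pred>
void elementwise(const float *lhs, const float *rhs, bool *out, std::size_t n)
{
  Pred pred;
  for (std::size_t i = 0; i < n; ++i)
    out[i] = pred(lhs[i], rhs[i]);
}

// One table entry per enum value; each entry replaces a whole switch case.
static const CompareFn kCompareFns[] = {
    elementwise<std::equal_to<float>>,  elementwise<std::not_equal_to<float>>,
    elementwise<std::greater<float>>,   elementwise<std::greater_equal<float>>,
    elementwise<std::less<float>>,      elementwise<std::less_equal<float>>,
};

void compare(CompareOp op, const float *lhs, const float *rhs, bool *out, std::size_t n)
{
  const auto index = static_cast<std::size_t>(op);
  if (index >= sizeof(kCompareFns) / sizeof(kCompareFns[0]))
    throw std::runtime_error{"Invalid CompareOp"};
  kCompareFns[index](lhs, rhs, out, n); // single dispatch point instead of a per-op switch
}

The patch applies the same idea twice per helper (a broadcast table and a non-broadcast table, selected by the is_broadcast flag), so supporting a new comparison means appending one entry per table rather than adding two switch cases, and the static_asserts turn any reordering of the enum into a compile-time error.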
+@@ -128,7 +146,7 @@ void FullyConnectedLayer::fullyConnectedHybrid() + _is_weights_freed = true; + } + } +-#endif // if 0 ++#endif + #endif + } + +@@ -167,7 +185,17 @@ void FullyConnectedLayer::run() + + void FullyConnectedLayer::prepare() + { ++ if (_bias && _bias->is_constant()) ++ { ++ const int bias_size = getTensorShape(_bias).FlatSize(); ++ if (nnfw::cker::IsZeroVector(reinterpret_cast(_bias->buffer()), bias_size)) ++ { ++ _bias = nullptr; ++ } ++ } ++ + #ifdef USE_RUY_GEMV ++#ifdef EXPERIMENTAL_RUY_FEATURE + // TODO This is workaround + // The only fc hybrid will use ruy kernel + if (_input->data_type() != OperandType::FLOAT32 || +@@ -199,6 +227,7 @@ void FullyConnectedLayer::prepare() + } + } + #endif ++#endif + } + + } // namespace ops +diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +index dd5ef24..e405b24 100644 +--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h ++++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +@@ -72,6 +72,9 @@ private: + + #ifdef USE_RUY_GEMV + uint8_t *_cached_weights = nullptr; // weights to be cached and a key ++#ifdef EXPERIMENTAL_RUY_FEATURE ++ bool _is_weights_freed = false; // is weights freed? ++#endif + #endif + }; + +diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc +new file mode 100644 +index 0000000..0d99b05 +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "L2NormLayer.h" ++ ++#include "OperationUtils.h" ++ ++#include ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++ ++void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output) ++{ ++ assert(input != nullptr); ++ assert(output != nullptr); ++ ++ _input = input; ++ _output = output; ++} ++ ++void L2NormLayer::run() ++{ ++ switch (_input->data_type()) ++ { ++ case OperandType::FLOAT32: ++ nnfw::cker::L2NormalizeFloat32( ++ getTensorShape(_input), reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer())); ++ break; ++ ++ case OperandType::QUANT_UINT8_ASYMM: ++ { ++ nnfw::cker::L2NormParams params; ++ assert(_input->data_offset() == 128); ++ params.input_zero_point = _input->data_offset(); ++ nnfw::cker::L2NormalizeQuant8( ++ params, getTensorShape(_input), reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer())); ++ } ++ break; ++ ++ default: ++ throw std::runtime_error{"L2Norm: Unsupported data type"}; ++ } ++} ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h +new file mode 100644 +index 0000000..63f2d11 +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h +@@ -0,0 +1,55 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License.
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ ++#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ ++ ++#include ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++class L2NormLayer : public ::onert::exec::IFunction ++{ ++public: ++ L2NormLayer() : _input(nullptr), _output(nullptr) ++ { ++ // Nothing ++ } ++ ++public: ++ void configure(const IPortableTensor *_input, IPortableTensor *output); ++ ++ void run() override; ++ ++private: ++ const IPortableTensor *_input; ++ IPortableTensor *_output; ++}; ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ +diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +index d71e325..06dde4f 100644 +--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc ++++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +@@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8() + // NYI + } + +-void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis, +- Tensor *output) ++void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis, ++ IPortableTensor *output) + { + _input = input; + _output = output; +diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +index bc145ce..ba9deca 100644 +--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h ++++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +@@ -40,13 +40,14 @@ public: + + void logsoftmaxQuant8(); + +- void configure(const Tensor *input, const float beta, const int axis, Tensor *output); ++ void configure(const IPortableTensor *input, const float beta, const int axis, ++ IPortableTensor *output); + + void run(); + + private: +- const Tensor *_input; +- Tensor *_output; ++ const IPortableTensor *_input; ++ IPortableTensor *_output; + + float _beta; + int _axis; +diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h +index 8d29374..9838552 100644 +--- a/runtime/onert/backend/cpu/ops/OperationUtils.h ++++ b/runtime/onert/backend/cpu/ops/OperationUtils.h +@@ -52,6 +52,17 @@ union DataPtr { + void *v; + }; + ++union ConstDataPtr { ++ const uint8_t *u8; ++ const int8_t *i8; ++ const uint32_t *u32; ++ const int32_t *i32; ++ const bool *b; ++ const float *f; ++ const int64_t *i64; ++ const void *v; ++}; ++ + uint32_t getNumberOfDimensions(const IPortableTensor *tensor); + + uint32_t getNumberOfElements(const IPortableTensor *tensor); +diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc +index fcfcf7b..6a2bf9d 100644 +--- a/runtime/onert/backend/cpu/ops/PadLayer.cc ++++ b/runtime/onert/backend/cpu/ops/PadLayer.cc +@@ -33,33 +33,40 @@ PadLayer::PadLayer() + // DO NOTHING + } + +-void PadLayer::padFloat32() ++template void PadLayer::padImpl(const T *constant_value_data) + { +- nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), +- reinterpret_cast(_input->buffer()), getTensorShape(_output), +- reinterpret_cast(_output->buffer()), _constantValueData.f); ++ nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), ++ reinterpret_cast(_input->buffer()), getTensorShape(_output), ++ reinterpret_cast(_output->buffer()), constant_value_data); + } +-void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); } + + void PadLayer::configure(const 
IPortableTensor *input, IPortableTensor *output, +- const int32_t *padData, int32_t padRank, uint8_t *constantValueData) ++ const int32_t *padData, int32_t padRank, const void *constantValueData) + { + _input = input; + _output = output; + memcpy(_padData, padData, sizeof(_padData)); + _padRank = padRank; +- _constantValueData.u8 = constantValueData; ++ _constantValueData.v = constantValueData; + } + + void PadLayer::run() + { + if (_input->data_type() == OperandType::FLOAT32) + { +- padFloat32(); ++ padImpl(_constantValueData.f); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { +- padQuant8(); ++ if (_constantValueData.u8 == nullptr) ++ { ++ uint8_t pad_value = static_cast(_output->data_offset()); ++ padImpl(&pad_value); ++ } ++ else ++ { ++ padImpl(_constantValueData.u8); ++ } + } + else + { +diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h +index 85bd2e6..efd73d5 100644 +--- a/runtime/onert/backend/cpu/ops/PadLayer.h ++++ b/runtime/onert/backend/cpu/ops/PadLayer.h +@@ -39,12 +39,10 @@ public: + PadLayer(); + + public: +- void padFloat32(); +- +- void padQuant8(); ++ template void padImpl(const T *constant_value_data); + + void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData, +- int32_t padRank, uint8_t *constantValueData = nullptr); ++ int32_t padRank, const void *constantValueData = nullptr); + + void run() override; + +@@ -54,7 +52,7 @@ private: + + int32_t _padData[8]; + int32_t _padRank; +- DataPtr _constantValueData; ++ ConstDataPtr _constantValueData; + }; + + } // namespace ops +diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc +new file mode 100644 +index 0000000..45fc148 +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "QuantizeLayer.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++ ++QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr) ++{ ++ // DO NOTHING ++} ++ ++template void QuantizeLayer::affineQuantize() ++{ ++ nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer()), ++ _output->data_scale(), _output->data_offset()); ++} ++ ++void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output) ++{ ++ _input = input; ++ _output = output; ++} ++ ++void QuantizeLayer::run() ++{ ++ if (_input->data_type() == OperandType::FLOAT32) ++ { ++ affineQuantize(); ++ } ++ else ++ { ++ throw std::runtime_error{"Quantize: unsupported data type"}; ++ } ++} ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h +new file mode 100644 +index 0000000..b4e7aca +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ ++#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ ++ ++#include ++#include "OperationUtils.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++ ++class QuantizeLayer : public ::onert::exec::IFunction ++{ ++public: ++ QuantizeLayer(); ++ ++public: ++ template void affineQuantize(); ++ ++ void configure(const IPortableTensor *input, IPortableTensor *output); ++ ++ void run() override; ++ ++private: ++ const IPortableTensor *_input; ++ IPortableTensor *_output; ++}; ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ +diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc +index a9106c1..449c073 100644 +--- a/runtime/onert/backend/cpu/ops/SliceLayer.cc ++++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc +@@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b + } + } + +-void SliceLayer::sliceFloat32() ++template void SliceLayer::sliceImpl() + { + const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize; + +@@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32() + } + + nnfw::cker::Slice(op_params, getExtendedTensorShape(_input), +- reinterpret_cast(_input->buffer()), +- reinterpret_cast(_output->buffer())); +-} +- +-void SliceLayer::sliceQuant8() +-{ +- // cker quant8 slice is not implemented yet +- throw std::runtime_error{"NYI"}; ++ reinterpret_cast(_input->buffer()), ++ reinterpret_cast(_output->buffer())); + } + + void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, +@@ -97,11 +91,11 @@ void SliceLayer::run() + { + if (_input->data_type() == OperandType::FLOAT32) + { +- sliceFloat32(); ++ sliceImpl(); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { +- sliceQuant8(); ++ sliceImpl(); + } + else + { +diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h +index 9945d7e..650e2c9 100644 +--- a/runtime/onert/backend/cpu/ops/SliceLayer.h ++++ b/runtime/onert/backend/cpu/ops/SliceLayer.h +@@ -42,8 +42,7 @@ public: + void run() override; + + private: +- void sliceFloat32(); +- void sliceQuant8(); ++ template void sliceImpl(); + + template + void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin, +diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc +new file mode 100644 +index 0000000..110b0bc +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc +@@ -0,0 +1,70 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "SpaceToDepthLayer.h" ++ ++#include "OperationUtils.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr) ++{ ++ // DO NOTHING ++} ++ ++template void SpaceToDepthLayer::spaceToDepth() ++{ ++ ++ nnfw::cker::SpaceToDepthParams params; ++ params.block_size = _block_size; ++ ++ nnfw::cker::SpaceToDepth(params, getTensorShape(_input), ++ reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer())); ++} ++ ++void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size, ++ IPortableTensor *output) ++{ ++ _input = input; ++ _block_size = block_size; ++ _output = output; ++} ++ ++void SpaceToDepthLayer::run() ++{ ++ if (_input->data_type() == OperandType::FLOAT32) ++ { ++ spaceToDepth(); ++ } ++ else ++ { ++ throw std::runtime_error{"SpaceToDepth: unsupported data type"}; ++ } ++} ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h +new file mode 100644 +index 0000000..c11ef2b +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h +@@ -0,0 +1,54 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License.
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ ++#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ ++ ++#include ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++class SpaceToDepthLayer : public ::onert::exec::IFunction ++{ ++public: ++ SpaceToDepthLayer(); ++ ++ void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output); ++ ++ void run() override; ++ ++private: ++ template void spaceToDepth(); ++ ++ const IPortableTensor *_input; ++ int32_t _block_size; ++ IPortableTensor *_output; ++}; ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ +diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h +index a49525b..b760cda 100644 +--- a/runtime/onert/core/include/backend/ITensorBuilder.h ++++ b/runtime/onert/core/include/backend/ITensorBuilder.h +@@ -112,12 +112,12 @@ public: // methods for static tensor allocation + virtual std::shared_ptr tensorAt(const ir::OperandIndex &ind) = 0; + + /** +- * @brief Set the External Tensor object ++ * @brief Set the migrant tensor object + * + * @return true if succeeded + * @return false if failed or unsupported + */ +- virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr &) ++ virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr &) + { + return false; + } +diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h +index f5a95f4..8555131 100644 +--- a/runtime/onert/core/include/backend/ITensorRegistry.h ++++ b/runtime/onert/core/include/backend/ITensorRegistry.h +@@ -35,17 +35,22 @@ struct ITensorRegistry + virtual ~ITensorRegistry() = default; + + /** +- * @brief Returns pointer of ITensor among managed and external tensors ++ * @brief Returns pointer of ITensor among native and migrant tensors ++ * ++ * Native Tensor is a tensor that is managed by this backend ++ * Migrant Tensor is a tensor that is imported from another backend ++ * + * @note Return tensor cannot be used longer than dynamic tensor manager + */ + virtual std::shared_ptr getITensor(const ir::OperandIndex &) = 0; + /** +- * @brief Returns pointer of ITensor among managed tensors ++ * @brief Returns pointer of ITensor among native tensors + * +- * Unlike @c getITensor , this function only searches from managed tensors +- * @note Return tensor cannot be used longer than dynamic tensor manager ++ * Unlike @c getITensor , this function only searches from native tensors ++ * ++ * @note Returned tensor cannot be used longer than dynamic tensor manager + */ +- virtual std::shared_ptr getManagedITensor(const ir::OperandIndex &) = 0; ++ virtual std::shared_ptr getNativeITensor(const ir::OperandIndex &) = 0; + }; + + } // namespace backend +@@ -73,68 +78,67 @@ public: + std::shared_ptr getITensor(const ir::OperandIndex &ind) override + { + static_assert(std::is_base_of::value, "T_Tensor must derive from ITensor."); +- auto external_tensor = _external.find(ind); +- if (external_tensor != _external.end()) ++ auto external_tensor = _migrant.find(ind); ++ if (external_tensor != _migrant.end()) + return external_tensor->second; +- return getManagedTensor(ind); ++ return getNativeTensor(ind); + } + +- std::shared_ptr getManagedITensor(const ir::OperandIndex &ind) override ++ std::shared_ptr
getNativeITensor(const ir::OperandIndex &ind) override + { +- return getManagedTensor(ind); ++ return getNativeTensor(ind); + } + + std::shared_ptr getPortableTensor(const ir::OperandIndex &ind) + { +- auto external_tensor = _external.find(ind); +- if (external_tensor != _external.end()) ++ auto external_tensor = _migrant.find(ind); ++ if (external_tensor != _migrant.end()) + { + if (external_tensor->second) + return external_tensor->second; + } +- return getManagedTensor(ind); ++ return getNativeTensor(ind); + } + +- std::shared_ptr getManagedTensor(const ir::OperandIndex &ind) ++ std::shared_ptr getNativeTensor(const ir::OperandIndex &ind) + { +- auto tensor = _managed.find(ind); +- if (tensor != _managed.end()) ++ auto tensor = _native.find(ind); ++ if (tensor != _native.end()) + return tensor->second; + return nullptr; + } + +- bool setExternalTensor(const ir::OperandIndex &ind, +- const std::shared_ptr &tensor) ++ bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) + { + // TODO Uncomment this as two tensors for an index is not allowed. + // But now it is temporarily allowed as a workaround. External one hides Managed one. +- // auto itr = _managed.find(ind); +- // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr) ++ // auto itr = _native.find(ind); ++ // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr) + // throw std::runtime_error{ +- // "Tried to set an external tensor but an managed tensor already exists."}; +- _external[ind] = tensor; ++ // "Tried to set a migrant tensor but a native tensor already exists."}; ++ _migrant[ind] = tensor; + return true; + } + +- void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) ++ void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) + { +- auto itr = _external.find(ind); +- if (itr != _external.end() && itr->second != nullptr && tensor != nullptr) ++ auto itr = _migrant.find(ind); ++ if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr) + throw std::runtime_error{ +- "Tried to set a managed tensor but an external tensor already exists."}; +- _managed[ind] = tensor; ++ "Tried to set a native tensor but a migrant tensor already exists."}; ++ _native[ind] = tensor; + } + +- const ir::OperandIndexMap> &managed_tensors() { return _managed; } ++ const ir::OperandIndexMap> &native_tensors() { return _native; } + +- const ir::OperandIndexMap> &external_tensors() ++ const ir::OperandIndexMap> &migrant_tensors() + { +- return _external; ++ return _migrant; + } + + private: +- ir::OperandIndexMap> _external; +- ir::OperandIndexMap> _managed; ++ ir::OperandIndexMap> _migrant; ++ ir::OperandIndexMap> _native; + }; + + } // namespace backend +diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +index 6ddacc7..a7e034a 100644 +--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h ++++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +@@ -19,7 +19,7 @@ + + #include "MemoryManager.h" + +-#include "backend/ITensorManager.h" ++#include "backend/IStaticTensorManager.h" + #include "ir/OperandIndexMap.h" + #include "ir/OperandInfo.h" + #include "TensorRegistry.h" +@@ -31,7 +31,7 @@ namespace backend + namespace cpu_common + { + +-class StaticTensorManager : public backend::ITensorManager ++class StaticTensorManager : public backend::IStaticTensorManager + { + public: +
StaticTensorManager(const std::shared_ptr ®); +diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h +index 379143b..b3391a3 100644 +--- a/runtime/onert/core/include/compiler/StaticShapeInference.h ++++ b/runtime/onert/core/include/compiler/StaticShapeInference.h +@@ -99,6 +99,7 @@ private: + void visit(const ir::operation::LogicalNot &op) override; + void visit(const ir::operation::LogicalOr &op) override; + void visit(const ir::operation::Logistic &op) override; ++ void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::MatrixBandPart &op) override; + void visit(const ir::operation::Max &op) override; + void visit(const ir::operation::Min &op) override; +diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h +index 113c348..601c1bf 100644 +--- a/runtime/onert/core/include/exec/DynamicShapeInference.h ++++ b/runtime/onert/core/include/exec/DynamicShapeInference.h +@@ -72,6 +72,7 @@ public: + void visit(const ir::operation::LogicalNot &op) override; + void visit(const ir::operation::LogicalOr &op) override; + void visit(const ir::operation::Logistic &op) override; ++ void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::MatrixBandPart &op) override; + void visit(const ir::operation::Max &op) override; + void visit(const ir::operation::Min &op) override; +diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h +index 5fac54e..e3b5d19 100644 +--- a/runtime/onert/core/include/ir/Operations.Include.h ++++ b/runtime/onert/core/include/ir/Operations.Include.h +@@ -103,3 +103,4 @@ + #include "ir/operation/BatchMatMul.h" + #include "ir/operation/FusedBatchNorm.h" + #include "ir/operation/LogSoftmax.h" ++#include "ir/operation/Quantize.h" +diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst +index 9d0642f..03a2aa2 100644 +--- a/runtime/onert/core/include/ir/Operations.lst ++++ b/runtime/onert/core/include/ir/Operations.lst +@@ -106,3 +106,4 @@ OP(MatrixBandPart) + OP(BatchMatMul) + OP(FusedBatchNorm) + OP(LogSoftmax) ++OP(Quantize) +diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h +index 26a92d7..391b4ba 100644 +--- a/runtime/onert/core/include/ir/operation/LogSoftmax.h ++++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h +@@ -48,7 +48,7 @@ public: + + public: + void accept(OperationVisitor &v) const override; +- OpCode opcode() const final { return OpCode::Softmax; } ++ OpCode opcode() const final { return OpCode::LogSoftmax; } + + public: + const Param ¶m() const { return _param; } +diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h +index a486061..00481cd 100644 +--- a/runtime/onert/core/include/ir/operation/Pad.h ++++ b/runtime/onert/core/include/ir/operation/Pad.h +@@ -33,7 +33,7 @@ public: + { + INPUT = 0, + PAD = 1, +- // VALUE = 2 Not allow padding value operand yet ++ VALUE = 2 + }; + + public: +diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h +new file mode 100644 +index 0000000..2533ce4 +--- /dev/null ++++ b/runtime/onert/core/include/ir/operation/Quantize.h +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__ ++#define __ONERT_IR_OPERATION_QUANTIZE_H__ ++ ++#include "ir/Operation.h" ++ ++namespace onert ++{ ++namespace ir ++{ ++namespace operation ++{ ++ ++class Quantize : public Operation ++{ ++public: ++ enum Input ++ { ++ INPUT = 0, ++ }; ++ ++public: ++ Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs); ++ ++public: ++ void accept(OperationVisitor &v) const override; ++ OpCode opcode() const final { return OpCode::Quantize; } ++}; ++ ++} // namespace operation ++} // namespace ir ++} // namespace onert ++ ++#endif // __ONERT_IR_OPERATION_QUANTIZE_H__ +diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +index 32a8041..c374aba 100644 +--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc ++++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +@@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptrgetManagedTensor(ind); ++ auto user_tensor = _user_tensors->getNativeTensor(ind); + if (user_tensor) + { + // User tensors cannot be reallocated. 
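The tensor-manager hunks here and the ITensorRegistry changes above follow one native/migrant policy: getITensor() prefers a migrant (imported) tensor, getNativeITensor() only sees tensors the backend owns, and registering a native tensor over an existing migrant one is rejected. The following is a rough, self-contained sketch of that policy under simplified assumptions: plain integer indices and a placeholder Tensor type stand in for onert's ir::OperandIndex and ITensor, and this is not the real PortableTensorRegistry implementation.

// Sketch of the native/migrant lookup and registration policy, as described above.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>

namespace sketch
{

struct Tensor
{
  explicit Tensor(std::string name) : name(std::move(name)) {}
  std::string name;
};

class Registry
{
public:
  std::shared_ptr<Tensor> getITensor(int ind) const
  {
    auto it = _migrant.find(ind);
    if (it != _migrant.end())
      return it->second; // a migrant tensor hides the native one
    return getNativeITensor(ind);
  }

  std::shared_ptr<Tensor> getNativeITensor(int ind) const
  {
    auto it = _native.find(ind);
    return it != _native.end() ? it->second : nullptr;
  }

  void setMigrantTensor(int ind, const std::shared_ptr<Tensor> &tensor) { _migrant[ind] = tensor; }

  void setNativeTensor(int ind, const std::shared_ptr<Tensor> &tensor)
  {
    auto it = _migrant.find(ind);
    if (it != _migrant.end() && it->second != nullptr && tensor != nullptr)
      throw std::runtime_error{"Tried to set a native tensor but a migrant tensor already exists."};
    _native[ind] = tensor;
  }

private:
  std::unordered_map<int, std::shared_ptr<Tensor>> _migrant;
  std::unordered_map<int, std::shared_ptr<Tensor>> _native;
};

} // namespace sketch

int main()
{
  sketch::Registry reg;
  reg.setNativeTensor(0, std::make_shared<sketch::Tensor>("native#0"));
  reg.setMigrantTensor(1, std::make_shared<sketch::Tensor>("migrant#1"));

  std::cout << reg.getITensor(1)->name << std::endl;                           // migrant#1
  std::cout << (reg.getNativeITensor(1) ? "found" : "not found") << std::endl; // not found
  return 0;
}

The same policy is what the new prepareExternalTensors() step later in this patch builds on: when a backend has no tensor of its own for an operand, the tensor created by another backend is handed over through setMigrantTensor().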
+@@ -47,8 +47,8 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha + user_tensor->setShape(new_shape); + } + +- // NOTE Then handle managed tensors +- auto tensor = _tensors->getManagedTensor(ind); ++ // NOTE Then handle native tensors ++ auto tensor = _tensors->getNativeTensor(ind); + assert(tensor); + + bool previously_dynamic = tensor->is_dynamic(); +@@ -101,9 +101,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, + ir::Layout backend_layout) + { +- assert(_tensors->getManagedTensor(ind) == nullptr); ++ assert(_tensors->getNativeTensor(ind) == nullptr); + auto tensor = std::make_shared(tensor_info, backend_layout); +- _tensors->setManagedTensor(ind, tensor); ++ _tensors->setNativeTensor(ind, tensor); + } + + void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +@@ -130,7 +130,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + auto &input_set = find->second; + for (auto input_ind : input_set) + { +- if (!_tensors->getManagedTensor(input_ind)->is_dynamic()) ++ if (!_tensors->getNativeTensor(input_ind)->is_dynamic()) + continue; + + _dynamic_mem_mgr->deallocate(input_ind); +@@ -141,7 +141,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + + void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) + { +- if (!_tensors->getManagedTensor(output_ind)->is_dynamic()) ++ if (!_tensors->getNativeTensor(output_ind)->is_dynamic()) + return; + + _dynamic_mem_mgr->deallocate(output_ind); +diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +index 4b683fb..eb83b7d 100644 +--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc ++++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +@@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node) + std::vector> input_tensors; + for (const auto input_index : node.getInputs()) + { +- auto input_alloc = getTensor(input_index); ++ auto input_tensor = getTensor(input_index); + +- input_tensors.emplace_back(input_alloc); ++ input_tensors.emplace_back(input_tensor); + } + + std::vector> output_tensors; + exec::DynAllocInfoMap outputs_dyn_alloc_info; + for (const auto output_index : node.getOutputs()) + { +- auto output_alloc = getTensor(output_index); ++ auto output_tensor = getTensor(output_index); + +- output_tensors.emplace_back(output_alloc); ++ output_tensors.emplace_back(output_tensor); + const auto output_tensor_builder = getTensorBuilder(output_index); + if (output_tensor_builder->supportDynamicTensor()) + { + auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); +- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; ++ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; + } + } + +@@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node) + std::vector> input_tensors; + for (const auto input_index : node.getInputs()) + { +- auto input_alloc = getTensor(input_index); ++ auto input_tensor = getTensor(input_index); + +- input_tensors.emplace_back(input_alloc); ++ input_tensors.emplace_back(input_tensor); + } + + std::vector> output_tensors; + std::unordered_map, exec::DynAllocInfo> outputs_dyn_alloc_info; + for (const auto output_index : node.getOutputs()) + { +- auto output_alloc = 
getTensor(output_index); ++ auto output_tensor = getTensor(output_index); + +- output_tensors.emplace_back(output_alloc); ++ output_tensors.emplace_back(output_tensor); + + const auto output_tensor_builder = getTensorBuilder(output_index); + if (output_tensor_builder->supportDynamicTensor()) + { + auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); +- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; ++ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; + } + } + +@@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index) + for (auto tensor_builder : _tensor_builder_set) + { + auto reg = tensor_builder->tensorRegistry(); +- auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index); ++ auto tensor = reg ? reg->getNativeITensor(index) : tensor_builder->tensorAt(index); + if (tensor) + { + ret = tensor_builder; +diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +index 16cd3ec..5bddb91 100644 +--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc ++++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +@@ -92,7 +92,7 @@ void TensorBuilder::allocate() + std::shared_ptr TensorBuilder::tensorAt(const ir::OperandIndex &ind) + { + // NOTE Find from User Tensor Registry first +- // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste ++ // FIXME There may be both user tensor and native tensor for a `ind` which is a waste + auto user_tensor = _user_tensor_reg->getITensor(ind); + auto tensor = _tensor_reg->getITensor(ind); + if (user_tensor) +@@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite + + std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) + { +- return _tensor_reg->getManagedTensor(ind); ++ return _tensor_reg->getNativeTensor(ind); + } + + std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) +@@ -123,7 +123,7 @@ std::unique_ptr TensorBuilder::releaseDynamicTensorManager(void) + void TensorBuilder::setUserTensor(const ir::OperandIndex &ind, + const std::shared_ptr &tensor) + { +- _user_tensor_reg->setManagedTensor(ind, tensor); ++ _user_tensor_reg->setNativeTensor(ind, tensor); + } + + } // namespace controlflow +diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h +index ce94ea0..b9b2d52 100644 +--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h ++++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h +@@ -68,6 +68,7 @@ public: + void set_dynamic() override { _dynamic = true; } + ir::Shape getShape() const override { return _info.shape(); } + void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); } ++ bool is_constant() const override { return false; } + + private: + ir::OperandInfo _info; +diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +index 0ccf700..ede403b 100644 +--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc ++++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +@@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha + { + VERBOSE_F() << ind << std::endl; + +- auto tensor = _tensors->getManagedTensor(ind); ++ auto tensor = 
_tensors->getNativeTensor(ind); + assert(tensor); + + bool previously_dynamic = tensor->is_dynamic(); +@@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, + ir::Layout backend_layout) + { +- assert(_tensors->getManagedTensor(ind) == nullptr); ++ assert(_tensors->getNativeTensor(ind) == nullptr); + auto tensor = std::make_shared(tensor_info, backend_layout); +- _tensors->setManagedTensor(ind, tensor); ++ _tensors->setNativeTensor(ind, tensor); + } + + void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +@@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + auto &input_set = find->second; + for (auto input_ind : input_set) + { +- auto *tensor = _tensors->getManagedTensor(input_ind).get(); ++ auto *tensor = _tensors->getNativeTensor(input_ind).get(); + if (!tensor->is_dynamic()) + continue; + +@@ -131,7 +131,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + + void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) + { +- auto *tensor = _tensors->getManagedTensor(output_ind).get(); ++ auto *tensor = _tensors->getNativeTensor(output_ind).get(); + if (!tensor->is_dynamic()) + return; + +diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +index 47bea35..8604542 100644 +--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc ++++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +@@ -33,7 +33,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr & + + void StaticTensorManager::allocateConsts(void) + { +- for (auto &pair : _tensors->managed_tensors()) ++ for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; +@@ -42,9 +42,9 @@ void StaticTensorManager::allocateConsts(void) + auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size()); + tensor->setBuffer(mem_alloc); + auto buffer = mem_alloc->base(); +- VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() +- << "): " << static_cast(buffer) +- << "size : " << tensor->total_size() << std::endl; ++ VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() ++ << "): " << static_cast(buffer) ++ << "size : " << tensor->total_size() << std::endl; + } + } + } +@@ -53,7 +53,7 @@ void StaticTensorManager::allocateNonconsts(void) + { + _nonconst_mgr->allocate(); + +- for (auto &pair : _tensors->managed_tensors()) ++ for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; +@@ -62,8 +62,8 @@ void StaticTensorManager::allocateNonconsts(void) + auto *buffer = _nonconst_mgr->getBuffer(ind); + tensor->setBuffer(buffer); + +- VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() +- << "): " << static_cast(buffer) << std::endl; ++ VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value() ++ << "): " << static_cast(buffer) << std::endl; + } + } + } +@@ -76,18 +76,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, ir::Layout backend_layout, + bool as_const) + { +- assert(!_tensors->getManagedTensor(ind)); ++ assert(!_tensors->getNativeTensor(ind)); + auto tensor = std::make_shared(tensor_info, backend_layout); +- _tensors->setManagedTensor(ind, tensor); ++ _tensors->setNativeTensor(ind, tensor); + 
_as_constants[ind] = as_const; + } + + void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) + { +- assert(_tensors->getManagedTensor(ind)); ++ assert(_tensors->getNativeTensor(ind)); + + // This method is called only when a tensor has proper shape +- assert(!_tensors->getManagedTensor(ind)->is_dynamic()); ++ assert(!_tensors->getNativeTensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->claimPlan(ind, size); +@@ -95,10 +95,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) + + void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) + { +- assert(_tensors->getManagedTensor(ind)); ++ assert(_tensors->getNativeTensor(ind)); + + // This method is called only when a tensor has proper shape +- assert(!_tensors->getManagedTensor(ind)->is_dynamic()); ++ assert(!_tensors->getNativeTensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->releasePlan(ind); +@@ -106,7 +106,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) + + void StaticTensorManager::iterate(const std::function &fn) + { +- for (const auto &it : _tensors->managed_tensors()) ++ for (const auto &it : _tensors->native_tensors()) + fn(it.first); + } + +diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc +index f3f69ad..8439b6a 100644 +--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc ++++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc +@@ -201,18 +201,35 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph, + // Add tensor to controlflow TensorRegistry. + cf_tensor_builder->setUserTensor(ind, tensor); + ret.push_back(tensor); +- +- // Set other tensors as external tensors +- for (auto &tensor_builder : tensor_builders) +- { +- // FIXME This is a workaround registering all user tensors to all backends +- // FIXME Handle when it is failed +- tensor_builder->setExternalTensor(ind, tensor); +- } + } + return ret; + } + ++void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph, ++ TensorBuilders &tensor_builders) ++{ ++ lowered_graph.op_seqs().iterate( ++ [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) { ++ auto lower_info = lowered_graph.getLowerInfo(op_seq_index); ++ auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend()); ++ for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED | ++ ir::Remove::UNDEFINED) ++ { ++ // If an OpSequence input/output tensor does not have a own tensor object, ++ // it must be using external tensors, so find the tensor from other tensor builders and ++ // set the tensor to this tensor builder if portable ++ if (!backend_ctx->tensor_builder->tensorAt(ind)) ++ { ++ auto tensor = tensor_builders.getITensor(ind); ++ assert(tensor); // The tensor must have been created in one of TensorBuilders ++ auto ptensor = std::dynamic_pointer_cast(tensor); ++ if (ptensor) ++ backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor); ++ } ++ } ++ }); ++} ++ + exec::IExecutor * + ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_graph, + const compiler::CompilerOptions &options, +@@ -265,6 +282,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_ + tensor_builder->prepare(); + } + ++ prepareExternalTensors(*lowered_graph, tensor_builders); ++ + ExecutionBuilder builder; + + // Generate kernels +@@ -367,6 +386,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( + 
tensor_builder->prepare(); + } + ++ prepareExternalTensors(*lowered_graph, tensor_builders); ++ + ExecutionBuilder builder; + + // Generate kernels +diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h +index 1e82b98..418e5a7 100644 +--- a/runtime/onert/core/src/compiler/ExecutorFactory.h ++++ b/runtime/onert/core/src/compiler/ExecutorFactory.h +@@ -22,6 +22,7 @@ + #include "backend/ITensor.h" + #include "exec/IExecutor.h" + #include "ir/LoweredGraph.h" ++#include "TensorBuilders.h" + + namespace onert + { +@@ -48,6 +49,8 @@ private: + static std::vector> + initializeModelIOTensors(ir::LoweredGraph &lowered_graph, + const ir::OperandIndexSequence &indices); ++ static void prepareExternalTensors(ir::LoweredGraph &lowered_graph, ++ TensorBuilders &tensor_builders); + static exec::IExecutor * + createLinearExecutor(std::unique_ptr lowered_graph, + const compiler::CompilerOptions &options, +diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h +index f507539..d8ceca9 100644 +--- a/runtime/onert/core/src/compiler/HEScheduler.h ++++ b/runtime/onert/core/src/compiler/HEScheduler.h +@@ -51,16 +51,12 @@ public: + * @param[in] backend_resolver backend resolver + */ + HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options) +- : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{}, ++ : _is_supported{}, _backends_avail_time{}, _ops_eft{}, + _op_to_rank{std::make_shared>()}, + _is_profiling_mode{options.he_profiling_mode}, + _is_linear_exec{options.executor == "Linear"}, + _is_parallel_exec{options.executor == "Parallel"} + { +- // Workaround to avoid unused-private-field warning +- // TODO use _backend_contexts and remove workaround +- (void)_backend_contexts; +- + for (auto &entry : backend_contexts) + { + _all_backends.push_back(entry.first); +@@ -165,7 +161,6 @@ private: + // whether it should assign these backends to these nodes: + // * It stores false for unsupported nodes + // * During rank calculation with enabled profiling mode it stores true for supported nodes +- const backend::BackendContexts &_backend_contexts; + std::unordered_map> _is_supported; + // Finishing and starting time of each backend + std::unordered_map> _backends_avail_time; +@@ -175,8 +170,7 @@ private: + std::unique_ptr _backend_resolver; + std::unique_ptr _exec_time; + const ir::Graph *_graph{nullptr}; +- std::vector +- _all_backends; // TODO Remove this and use _backend_contexts instead ++ std::vector _all_backends; + const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend + bool _is_profiling_mode; + bool _is_linear_exec; +diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc +index 5c545ae..fa5ee27 100644 +--- a/runtime/onert/core/src/compiler/OperationValidator.cc ++++ b/runtime/onert/core/src/compiler/OperationValidator.cc +@@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph) + { + } + ++void OperationValidator::checkUnaryOp(const ir::Operation &node) ++{ ++ const auto output_index{node.getOutputs().at(0)}; ++ const auto input_index{node.getInputs().at(0)}; ++ ++ // Check if I/O types match ++ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); ++ ++ if (_ctx.at(output_index).info().isDynamic()) ++ return; ++ ++ // Check if I/O shapes match ++ 
OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); ++} ++ + void OperationValidator::operator()() + { + // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when +@@ -53,16 +68,7 @@ void OperationValidator::operator()() + [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); + } + +-void OperationValidator::visit(const ir::operation::Abs &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::AvgPool2D &node) + { +@@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node) + num_units == _ctx.at(hidden_state_out_index).shape().dim(1)); + } + +-void OperationValidator::visit(const ir::operation::Round &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)}; +- +- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +- +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) + { +@@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node) + } + } + +-void OperationValidator::visit(const ir::operation::Exp &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; +- +- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +- +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::ExpandDims &node) + { +@@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node) + OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); + } + +-void OperationValidator::visit(const ir::operation::Floor &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; +- +- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +- +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::HashtableLookup &node) + { +@@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node) + } + } + ++void OperationValidator::visit(const ir::operation::L2Normalization &node) ++{ ++ const auto ofm_index{node.getOutputs().at(0)}; ++ if (_ctx.at(ofm_index).info().isDynamic()) ++ return; ++ ++ const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; ++ ++ auto ifm_shape 
= _ctx.at(ifm_index).shape(); ++ auto ofm_shape = _ctx.at(ofm_index).shape(); ++ ++ OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); ++ ++ for (auto i = 0; i < ifm_shape.rank(); i++) ++ { ++ OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); ++ } ++} ++ + void OperationValidator::visit(const ir::operation::Unpack &node) + { + const auto num{node.param().num}; +@@ -904,35 +899,11 @@ void OperationValidator::visit(const ir::operation::Split &node) + OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); + } + +-void OperationValidator::visit(const ir::operation::Cos &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} +- +-void OperationValidator::visit(const ir::operation::Sin &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; ++void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); } + +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); } + +-void OperationValidator::visit(const ir::operation::RSQRT &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::Shape &node) + { +@@ -972,35 +943,11 @@ void OperationValidator::visit(const ir::operation::While &node) + // TODO Add to validate with subgraphs + } + +-void OperationValidator::visit(const ir::operation::Neg &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; ++void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); } + +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); } + +-void OperationValidator::visit(const ir::operation::Log &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} +- +-void OperationValidator::visit(const ir::operation::LogicalNot &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::SquaredDifference &node) + { +@@ -1118,5 +1065,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node) + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); + } ++ ++void OperationValidator::visit(const 
ir::operation::Quantize &node) ++{ ++ VERBOSE(Quantize) << "Configure Quantize operation" << std::endl; ++ ++ OP_REQUIRES(node.getInputs().size() == 1); ++ OP_REQUIRES(node.getOutputs().size() == 1); ++ ++ const auto input_index{node.getInputs().at(0)}; ++ const auto output_index{node.getOutputs().at(0)}; ++ ++ OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32); ++ ++ if (_ctx.at(output_index).info().isDynamic()) ++ return; ++ ++ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); ++ ++ OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); ++} + } // namespace compiler + } // namespace onert +diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h +index 6ceafe8..55a4dd5 100644 +--- a/runtime/onert/core/src/compiler/OperationValidator.h ++++ b/runtime/onert/core/src/compiler/OperationValidator.h +@@ -70,6 +70,7 @@ public: + void visit(const ir::operation::DepthToSpace &node) override; + void visit(const ir::operation::Pack &node) override; + void visit(const ir::operation::LSTM &node) override; ++ void visit(const ir::operation::L2Normalization &node) override; + void visit(const ir::operation::Unpack &node) override; + void visit(const ir::operation::Pad &node) override; + void visit(const ir::operation::Min &node) override; +@@ -93,9 +94,10 @@ public: + void visit(const ir::operation::Range &node) override; + void visit(const ir::operation::MatrixBandPart &node) override; + void visit(const ir::operation::LogSoftmax &node) override; ++ void visit(const ir::operation::Quantize &node) override; + + private: +- void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index); ++ void checkUnaryOp(const ir::Operation &node); + + private: + // TODO Remove _ctx field +diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc +index 5a58f2e..66de599 100644 +--- a/runtime/onert/core/src/compiler/StaticShapeInference.cc ++++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc +@@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op) + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT)); + } + ++void StaticShapeInferer::visit(const ir::operation::L2Normalization &op) ++{ ++ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT)); ++} ++ + void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op) + { + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)); +diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h +index 4bb7413..c0a1ebc 100644 +--- a/runtime/onert/core/src/compiler/TensorBuilders.h ++++ b/runtime/onert/core/src/compiler/TensorBuilders.h +@@ -23,6 +23,7 @@ + #include "backend/Backend.h" + #include "backend/controlflow/Config.h" + #include "backend/controlflow/TensorBuilder.h" ++#include "util/logging.h" + + namespace onert + { +@@ -66,6 +67,17 @@ public: + return _cf_tensor_builder; + } + ++ std::shared_ptr getITensor(ir::OperandIndex ind) ++ { ++ for (auto &tensor_builder : _tensor_builders) ++ { ++ auto tensor = tensor_builder->tensorAt(ind); ++ if (tensor) ++ return tensor; ++ } ++ return nullptr; ++ } ++ + private: + std::unordered_set> _tensor_builders; + std::shared_ptr _cf_tensor_builder; +diff --git 
a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc +index 1b82029..28e92ba 100644 +--- a/runtime/onert/core/src/exec/DynamicShapeInference.cc ++++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc +@@ -442,6 +442,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op) + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT)); + } + ++void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op) ++{ ++ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT)); ++} ++ + void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op) + { + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT)); +diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc +index a7409b9..864ccb3 100644 +--- a/runtime/onert/core/src/exec/ExecutorBase.cc ++++ b/runtime/onert/core/src/exec/ExecutorBase.cc +@@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, + { + auto tensor_registry = tensor_builder->tensorRegistry(); + assert(tensor_registry); +- tensor = tensor_registry->getManagedITensor(ind); ++ tensor = tensor_registry->getNativeITensor(ind); + if (tensor != nullptr) + { + if (tensor_builder->supportDynamicTensor()) +@@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, + { + auto tensor_registry = tensor_builder->tensorRegistry(); + assert(tensor_registry); +- tensor = tensor_registry->getManagedITensor(ind); ++ tensor = tensor_registry->getNativeITensor(ind); + if (tensor != nullptr) + { + if (tensor_builder->supportDynamicTensor()) +diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc +index d2e3627..c8dce69 100644 +--- a/runtime/onert/core/src/interp/operations/Pad.cc ++++ b/runtime/onert/core/src/interp/operations/Pad.cc +@@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso + const int32_t *pad_ptr = reinterpret_cast(pad_buffer); + float *output_ptr = reinterpret_cast(output_buffer); + +- nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr, +- nullptr); ++ nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, ++ output_ptr, nullptr); + } + + void invokePad(const ExecEnv *env, const ir::Operation &node) +diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc +index 6e93a23..f138089 100644 +--- a/runtime/onert/core/src/ir/LoweredGraph.cc ++++ b/runtime/onert/core/src/ir/LoweredGraph.cc +@@ -122,9 +122,6 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions & + + pass::PermutationInsertionPass pi_pass(*this); + pi_pass.run(); +- // Implemented code no longer works. +- // pass::PermutationEliminationPass pe_pass(*this); +- // pe_pass.run(); + + _op_seqs.dump("merged and sorted operations with permutation", _graph.operations()); + } +diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc +new file mode 100644 +index 0000000..0e3d5b6 +--- /dev/null ++++ b/runtime/onert/core/src/ir/operation/Quantize.cc +@@ -0,0 +1,37 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "ir/operation/Quantize.h" ++ ++#include "ir/OperationVisitor.h" ++ ++namespace onert ++{ ++namespace ir ++{ ++namespace operation ++{ ++ ++void Quantize::accept(OperationVisitor &v) const { v.visit(*this); } ++ ++Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) ++ : Operation{OperandConstraint::createExact(2u), inputs, outputs} ++{ ++} ++ ++} // namespace operation ++} // namespace ir ++} // namespace onert +diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc +deleted file mode 100644 +index 9e0291e..0000000 +--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc ++++ /dev/null +@@ -1,195 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
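The OperationValidator hunk earlier in this patch only admits FLOAT32 inputs and QUANT_UINT8_ASYMM outputs for the new Quantize operation; the kernel itself is not part of these hunks. As a rough sketch of the affine (asymmetric) scheme that output type implies, with scale and zero_point standing in for whatever the output operand's TypeInfo carries (illustration only, not the onert cpu backend kernel):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative only: float32 -> QUANT8_ASYMM conversion with a per-tensor
    // scale and zero point, clamped to the uint8 range.
    inline uint8_t quantizeAsymm(float x, float scale, int32_t zero_point)
    {
      const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }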
+- */ +- +-#include "PermutationEliminationPass.h" +- +-#include "ir/Operand.h" +-#include "ir/operand/LowerInfo.h" +-#include "ir/Graph.h" +-#include "backend/IConfig.h" +-#include "util/logging.h" +- +-namespace onert +-{ +-namespace ir +-{ +-namespace pass +-{ +-void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object) +-{ +- if (_graph.getInputs().contains(inp_index)) +- { +- eliminateInput(inp_index, object); +- } +- else if (_graph.getOutputs().contains(inp_index)) +- { +- eliminateOutput(inp_index, object); +- } +-} +- +-void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object) +-{ +- auto &model_inputs = _graph.getInputs(); +- +- // get uses of the model's given input +- auto uses = object.getUses(); +- +- // input must be used just by permutation +- if (uses.size() != 1) +- { +- return; +- } +- +- for (auto input_use : uses) +- { +- auto &perm_operation = _graph.operations().at(input_use); +- auto perm_inputs = perm_operation.getInputs(); +- +- auto perm_outputs = perm_operation.getOutputs(); +- +- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true)) +- { +- return; +- } +- +- assert(perm_inputs.at(0) == inp_index); +- +- VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n"; +- +- // set model's new input, which was output of permutation +- model_inputs.replace(inp_index, perm_outputs.at(0)); +- +- // remove model's input, which is also input of permutation +- _graph.removeOperand(inp_index); +- +- // remove permutation operation +- assert(_lowered_graph.op_seqs().containsOperation(input_use)); +- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use); +- _lowered_graph.op_seqs().remove(op_seq_idx); +- _graph.operations().remove(input_use); +- +- VERBOSE(PermutationEliminationPass::EliminateInput) +- << inp_index.value() << " is model's input and is removed. 
New input is " +- << perm_outputs.at(0).value() << "\n" +- << input_use.value() << " is removed permutation operation\n"; +- } +-} +- +-void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object) +-{ +- auto &model_outputs = _graph.getOutputs(); +- +- // get defs of the model's given output +- auto defs = object.getDef(); +- +- // output must use just permutation +- if (defs.size() != 1) +- { +- return; +- } +- +- for (auto output_def : defs) +- { +- auto &perm_operation = _graph.operations().at(output_def); +- auto perm_outputs = perm_operation.getOutputs(); +- +- auto perm_inputs = perm_operation.getInputs(); +- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false)) +- { +- return; +- } +- +- assert(perm_outputs.at(0) == out_index); +- +- VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n"; +- +- // Update operations' output that is used by permute operand +- for (auto perm_input_index : perm_inputs) +- { +- auto &perm_input_operand = _graph.operands().at(perm_input_index); +- perm_input_operand.removeUse(output_def); +- } +- +- // set model's new output, which was input of permutation +- model_outputs.replace(out_index, perm_inputs.at(0)); +- +- // remove model's output, which is also output of permutation +- _graph.removeOperand(out_index); +- +- // remove permutation operation +- assert(_lowered_graph.op_seqs().containsOperation(output_def)); +- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def); +- _lowered_graph.op_seqs().remove(op_seq_idx); +- _graph.operations().remove(output_def); +- +- VERBOSE(PermutationEliminationPass::EliminateOutput) +- << out_index.value() << " is model's output and is removed. New output is " +- << perm_inputs.at(0).value() << "\n" +- << output_def.value() << " is removed permutation operation\n"; +- } +-} +- +-bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, +- const OperandIndexSequence &out_indexes, +- bool is_for_model_input) +-{ +- auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors(); +- auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors(); +- +- auto input_layout = input_def_factors.getOnlyElement().layout(); +- auto output_layout = output_def_factors.getOnlyElement().layout(); +- +- if (input_def_factors.size() != 1 || output_def_factors.size() != 1) +- { +- return false; +- } +- +- // all operands' factor must be the same +- for (auto index : inp_indexes) +- { +- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); +- if (op_factor_set.size() != 1 || +- input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) +- { +- return false; +- } +- } +- // all operands' factor must be the same +- for (auto index : out_indexes) +- { +- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); +- if (op_factor_set.size() != 1 || +- output_layout != +- _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) +- { +- return false; +- } +- } +- +- if (is_for_model_input) +- { +- // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input +- return (inp_indexes.size() == 1 && input_layout == Layout::NHWC && +- output_layout == Layout::NCHW); +- } +- +- // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output +- return (out_indexes.size() == 1 && input_layout == Layout::NCHW && 
output_layout == Layout::NHWC); +-} +- +-} // namespace pass +-} // namespace ir +-} // namespace onert +diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h +deleted file mode 100644 +index 1c84300..0000000 +--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h ++++ /dev/null +@@ -1,86 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ +-#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ +- +-#include "LoweredOperandPass.h" +-#include "ir/Operand.h" +-#include "ir/OperandIndexSequence.h" +- +-namespace onert +-{ +-namespace ir +-{ +-namespace pass +-{ +- +-class PermutationEliminationPass : public LoweredOperandPass +-{ +-public: +- using LoweredOperandPass::LoweredOperandPass; +- +-public: +- std::string id() override { return "PermutationEliminationPass"; } +- +- void callback(const OperandIndex &index, Operand &object) override; +- +-private: +- /** +- * @brief Remove Permute operation that permutates input +- * +- * Note: This function aslo removes model's input and +- * sets output of permutation as model's new input +- * +- * @param inp_index is the target operand index for the elimination +- * @param object is the target operand object for the elimination +- * +- * @return +- */ +- void eliminateInput(const OperandIndex &inp_index, Operand &object); +- +- /** +- * @brief Remove Permute operation that permutates output of a model +- * +- * Note: This function aslo removes model's output and +- * sets input of permutation as model's new output +- * +- * @param out_index is the target operand index for the elimination +- * @param object is the target operand object for the elimination +- * +- * @return +- */ +- void eliminateOutput(const OperandIndex &out_index, Operand &object); +- +- /** +- * @brief Determine if passed operands are permute layer's input and output, that must be +- * eliminated +- * +- * @param inp_index indexes of the input operand to operation +- * @param out_index indexes of the output operand to operation +- * @param is_for_model_input checking for model's input or output +- * +- * @return if it is permutation layer +- */ +- bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, +- const OperandIndexSequence &out_indexes, bool is_for_model_input); +-}; +- +-} // namespace pass +-} // namespace ir +-} // namespace onert +- +-#endif // __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ +diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc +index 7c3da52..75efdd8 100644 +--- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc ++++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc +@@ -62,27 +62,26 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand 
&obje + auto insert_set = operand_li->use_factors() - operand_li->def_factors(); + auto def_factor = operand_li->def_factors().getOnlyElement(); + +- auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) { +- // TODO If other issues for Permute elimination are resolved, enable this +- return false; +- /* ++ auto compatible_backends = [](auto backend1, auto backend2) { + // TODO This is a workaround for not inserting Permute between cpu and controlflow. + // To be general, we need another way of checking they are compatible. + const auto cf = backend::controlflow::Config::ID; + const auto cpu = "cpu"; + const auto id1 = backend1->config()->id(); + const auto id2 = backend2->config()->id(); +- return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs +- || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs +- */ ++ // NOTE This is to skip Permute insertion for model inputs(controlflow -> cpu), but not ++ // outputs. This function currently assumes that backend1 is Def and backend2 is Use. However ++ // it is going to be fixed soon. ++ // TODO make both ways work ++ return (id1 == cpu && id2 == cf); + }; + + for (auto factor : insert_set) + { ++ // Check exceptional cases that Permute ops are not inserted + if (factor.layout() == def_factor.layout() && + compatible_backends(factor.backend(), def_factor.backend())) + { +- // For this factor we can just reuse existing operand - Permute is not added. + VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand " + << index << " / as the tensor is compatible with backend " + << factor.backend()->config()->id() << std::endl; +diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h +index f5687ad..f763346 100644 +--- a/runtime/onert/frontend/base_loader/include/base_loader.h ++++ b/runtime/onert/frontend/base_loader/include/base_loader.h +@@ -171,6 +171,8 @@ protected: + void loadBroadcastTo(const Operator *op, ir::Graph &subg); + void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); + void loadLogSoftmax(const Operator *op, ir::Graph &subg); ++ void loadQuantize(const Operator *op, ir::Graph &subg); ++ void loadSpaceToDepth(const Operator *op, ir::Graph &subg); + + protected: + // Base address for mapped region for loading (if needed) +@@ -1123,6 +1125,22 @@ void BaseLoader::loadBroadcastTo(const Operator *o + std::unique_ptr new_op(new ir::operation::BroadcastTo(inputs, outputs)); + subg.addOperation(std::move(new_op)); + } ++template ++void BaseLoader::loadSpaceToDepth(const Operator *op, ir::Graph &subg) ++{ ++ ir::OperandIndexSequence inputs; ++ ir::OperandIndexSequence outputs; ++ ir::operation::SpaceToDepth::Param param; ++ ++ const auto *options = op->builtin_options_as_SpaceToDepthOptions(); ++ ++ param.block_size = options->block_size(); ++ ++ loadOperationIO(op, inputs, outputs); ++ ++ std::unique_ptr new_op(new ir::operation::SpaceToDepth(inputs, outputs, param)); ++ subg.addOperation(std::move(new_op)); ++} + + template + void BaseLoader::loadCustom(const Operator *op, ir::Graph &subg) +@@ -1743,6 +1761,18 @@ void BaseLoader::loadLogSoftmax(const Operator *op + } + + template ++void BaseLoader::loadQuantize(const Operator *op, ir::Graph &subg) ++{ ++ ir::OperandIndexSequence inputs; ++ ir::OperandIndexSequence outputs; ++ ++ loadOperationIO(op, inputs, outputs); ++ ++ std::unique_ptr new_op(new ir::operation::Quantize(inputs, outputs)); ++ subg.addOperation(std::move(new_op)); ++} ++ 
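loadSpaceToDepth above only records block_size from the flatbuffer options; the shape contract it relies on is that, for an NHWC tensor, the spatial dimensions shrink by block_size while the channel dimension grows by block_size squared. A small illustrative helper (not part of this patch) that captures that rule:

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Output shape of SPACE_TO_DEPTH for an NHWC input; H and W must be
    // divisible by block_size.
    inline std::array<int32_t, 4> spaceToDepthShape(const std::array<int32_t, 4> &nhwc,
                                                    int32_t block_size)
    {
      assert(block_size >= 1);
      assert(nhwc[1] % block_size == 0 && nhwc[2] % block_size == 0);
      return {nhwc[0], nhwc[1] / block_size, nhwc[2] / block_size,
              nhwc[3] * block_size * block_size};
    }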
++template + void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg) + { + const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); +@@ -1959,6 +1989,12 @@ void BaseLoader::loadOperation(const Operator *op, + case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX: + loadLogSoftmax(op, subg); + return; ++ case BuiltinOperator::BuiltinOperator_QUANTIZE: ++ loadQuantize(op, subg); ++ return; ++ case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH: ++ loadSpaceToDepth(op, subg); ++ return; + default: + throw std::runtime_error( + std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); +diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +index 94791f8..00ffcb6 100644 +--- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc ++++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +@@ -106,6 +106,33 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type) + }; + } + ++template ++Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &) ++{ ++ assert(init_param.input_count == 1 && init_param.output_count == 1); ++ ++ OperandIndexSequence outputs{init_param.outputs[0]}; ++ ++ // Each input should be interpreted as follows: ++ // ++ // 0 -> Input Tensor Index ++ OperandIndexSequence inputs{init_param.inputs[0]}; ++ ++ return new T{inputs, outputs}; ++} ++ ++// A generator function for binary ops with no params ++template ++Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &) ++{ ++ assert(init_param.input_count == 2 && init_param.output_count == 1); ++ ++ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; ++ OperandIndexSequence outputs{init_param.outputs[0]}; ++ ++ return new T{inputs, outputs}; ++} ++ + } // namespace + + OperationFactory &OperationFactory::get() +@@ -116,20 +143,10 @@ OperationFactory &OperationFactory::get() + + OperationFactory::OperationFactory() + { +- _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param, +- Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> Block size Index +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::BatchToSpaceND{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> Input Tensor Index ++ // 1 -> Block size Index ++ _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -724,44 +741,11 @@ OperationFactory::OperationFactory() + return new operation::Squeeze{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Tanh{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && 
init_param.output_count == 1); ++ _map[ANEURALNETWORKS_TANH] = CreateSimpleUnaryOp; + +- OperandIndexSequence outputs{init_param.outputs[0]}; ++ _map[ANEURALNETWORKS_LOG] = CreateSimpleUnaryOp; + +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Log{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Logistic{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_LOGISTIC] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) { + assert(init_param.input_count == 3 && init_param.output_count == 1); +@@ -784,36 +768,16 @@ OperationFactory::OperationFactory() + return new operation::Div{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Exp{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_EXP] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_EXP_EX is deprecated + // TODO Remove ANEURALNETWORKS_EXP_EX + _map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP]; + +- _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> Axis Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::ExpandDims{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> Input Tensor Index ++ // 1 -> Axis Tensor Index ++ _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 2 && init_param.output_count == 1); +@@ -982,19 +946,7 @@ OperationFactory::OperationFactory() + return new operation::Comparison{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input0 Tensor Index +- // 1 -> input1 Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::LogicalAnd{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp; + + // ANEURALNETWORKS_LOGICAL_AND_EX is deprecated + // TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX +@@ -1018,18 +970,7 @@ OperationFactory::OperationFactory() + return new operation::LogicalAnd{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_RSQRT] = 
[](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::RSQRT{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_RSQRT] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 3 && init_param.output_count == 1); +@@ -1065,18 +1006,7 @@ OperationFactory::OperationFactory() + // TODO Remove ANEURALNETWORKS_RSQRT_EX + _map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT]; + +- _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::ReLU{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_RELU] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -1098,31 +1028,9 @@ OperationFactory::OperationFactory() + return new operation::ResizeBilinear{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; ++ _map[ANEURALNETWORKS_RELU1] = CreateSimpleUnaryOp; + +- return new operation::ReLU1{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::ReLU6{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_RELU6] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 2 && init_param.output_count == 1); +@@ -1438,18 +1346,7 @@ OperationFactory::OperationFactory() + return new operation::LogicalOr{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::LogicalNot{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_LOGICAL_NOT] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated + // TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX +@@ -1649,35 +1546,13 @@ OperationFactory::OperationFactory() + // TODO Remove ANEURALNETWORKS_GATHER_EX + _map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER]; + +- 
_map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Neg{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_NEG] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_NEG_EX is deprecated + // TODO Remove ANEURALNETWORKS_NEG_EX + _map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG]; + +- _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Abs{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_ABS] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_ABS_EX is deprecated + // TODO Remove ANEURALNETWORKS_ABS_EX +@@ -1704,18 +1579,7 @@ OperationFactory::OperationFactory() + // TODO Remove ANEURALNETWORKS_ARGMAX_EX + _map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX]; + +- _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Dequantize{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_DEQUANTIZE] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) { + assert(init_param.input_count == 3 && init_param.output_count == 1); +@@ -1841,31 +1705,24 @@ OperationFactory::OperationFactory() + }; + + _map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count >= 1); ++ assert(init_param.input_count >= 2 && init_param.input_count <= 3 && ++ init_param.output_count >= 1); + + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; ++ if (init_param.input_count == 3) ++ { ++ inputs.append(OperandIndex{init_param.inputs[2]}); ++ } + OperandIndexSequence outputs{init_param.outputs[0]}; + + return new operation::Pad{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); ++ _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD]; + +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- OperandIndexSequence outputs{init_param.outputs[0]}; ++ _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp; + +- return new operation::Min{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- return new operation::Max{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp; + + 
_map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -1948,34 +1805,15 @@ OperationFactory::OperationFactory() + return new operation::Range{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); ++ // Each input should be interpreted as follows: ++ // 0 -> LHS Tensor Index ++ // 1 -> RHS Tensor Index ++ _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp; + +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> LHS Tensor Index +- // 1 -> RHS Tensor Index +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::Pow{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- // Each input should be interpreted as follows: +- // +- // 0 -> A tensor, specifying the input. +- // 1 -> A 1-D tensor, specifying the value +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- return new operation::Fill{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> A tensor, specifying the input. ++ // 1 -> A 1-D tensor, specifying the value ++ _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 1 && init_param.output_count == 1); +@@ -1989,20 +1827,10 @@ OperationFactory::OperationFactory() + return new operation::ZerosLike{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> Multiple Tensor Index +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::Tile{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> Input Tensor Index ++ // 1 -> Multiple Tensor Index ++ _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param, + Operands &) { +@@ -2064,21 +1892,9 @@ OperationFactory::OperationFactory() + return new operation::Einsum{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param, +- Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> int32, int64, An 1-D int tensor Index +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::BroadcastTo{inputs, outputs}; +- }; ++ // 0 -> Input Tensor Index ++ // 1 -> int32, int64, An 1-D int tensor Index ++ _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -2133,6 +1949,15 @@ 
OperationFactory::OperationFactory() + + return new operation::LogSoftmax{inputs, outputs, param}; + }; ++ ++ _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { ++ assert(init_param.input_count == 1 && init_param.output_count == 1); ++ ++ OperandIndexSequence inputs{init_param.inputs[0]}; ++ OperandIndexSequence outputs{init_param.outputs[0]}; ++ ++ return new operation::Quantize{inputs, outputs}; ++ }; + } + + Operation *OperationFactory::create(ANeuralNetworksOperationType type, +diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc +index cc04347..0fcf372 100644 +--- a/runtime/onert/test/core/exec/ExecInstance.cc ++++ b/runtime/onert/test/core/exec/ExecInstance.cc +@@ -73,9 +73,8 @@ public: + // Compile + auto subgs = std::make_shared(); + subgs->push(onert::ir::SubgraphIndex{0}, graph); +- auto compiler = new onert::compiler::Compiler{subgs}; +- executors = compiler->compile(); +- delete compiler; ++ onert::compiler::Compiler compiler{subgs}; ++ executors = compiler.compile(); + } + + public: +@@ -98,19 +97,17 @@ TEST(ExecInstance, simple) + float output_buffer[4] = {}; + const float output_expected[4] = {5, -2, 0, -1}; + +- auto execution = new onert::exec::Execution(executors); ++ onert::exec::Execution execution{executors}; + +- execution->setInput(input1, reinterpret_cast(input1_buffer), 16); +- execution->setInput(input2, reinterpret_cast(input2_buffer), 16); +- execution->setOutput(output, reinterpret_cast(output_buffer), 16); +- execution->execute(); ++ execution.setInput(input1, reinterpret_cast(input1_buffer), 16); ++ execution.setInput(input2, reinterpret_cast(input2_buffer), 16); ++ execution.setOutput(output, reinterpret_cast(output_buffer), 16); ++ execution.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +- +- delete execution; + } + + TEST(ExecInstance, twoCompile) +@@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile) + auto mockup = CompiledMockUpModel(); + auto graph = mockup.graph; + auto executors1 = mockup.executors; +- auto execution1 = new onert::exec::Execution(executors1); ++ onert::exec::Execution execution1{executors1}; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; +@@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile) + float exe1_output_buffer[4] = {}; + const float exe1_output_expected[4] = {5, -2, 0, -1}; + +- execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); +- execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); +- execution1->setOutput(output, reinterpret_cast(exe1_output_buffer), 16); ++ execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); ++ execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); ++ execution1.setOutput(output, reinterpret_cast(exe1_output_buffer), 16); + + // Make new executor: compile again + auto subgs = std::make_shared(); + subgs->push(onert::ir::SubgraphIndex{0}, graph); +- auto compiler = new onert::compiler::Compiler{subgs}; +- std::shared_ptr executors2 = compiler->compile(); +- auto execution2 = new onert::exec::Execution(executors2); ++ onert::compiler::Compiler compiler{subgs}; ++ std::shared_ptr executors2 = compiler.compile(); ++ onert::exec::Execution execution2{executors2}; + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + const float exe2_output_expected[4] = {2, 5, -2, 7}; + +- 
execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); +- execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); +- execution2->setOutput(output, reinterpret_cast(exe2_output_buffer), 16); ++ execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); ++ execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); ++ execution2.setOutput(output, reinterpret_cast(exe2_output_buffer), 16); + +- execution1->execute(); +- execution2->execute(); ++ execution1.execute(); ++ execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); + EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +- +- delete compiler; +- delete execution1; +- delete execution2; + } + + // Support two initialized execution instance then ordered execution +@@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution) + const float exe1_output_expected[4] = {5, -2, 0, -1}; + const float exe2_output_expected[4] = {2, 5, -2, 7}; + +- auto execution1 = new onert::exec::Execution(executors); +- execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); +- execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); +- execution1->setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); ++ onert::exec::Execution execution1{executors}; ++ execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); ++ execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); ++ execution1.setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + + // Make new execution +- auto execution2 = new onert::exec::Execution(executors); +- execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); +- execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); +- execution2->setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); ++ onert::exec::Execution execution2{executors}; ++ execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); ++ execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); ++ execution2.setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); + +- execution1->execute(); +- execution2->execute(); ++ execution1.execute(); ++ execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); + EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +- +- delete execution1; +- delete execution2; + } + + class Inference +@@ -222,14 +212,12 @@ public: + auto input2 = IOIndex{1}; + auto output1 = IOIndex{0}; + +- auto execution = new onert::exec::Execution(_executors); +- execution->setInput(input1, reinterpret_cast(_input1), 16); +- execution->setInput(input2, reinterpret_cast(_input2), 16); +- execution->setOutput(output1, reinterpret_cast(_output), 16); ++ onert::exec::Execution execution{_executors}; ++ execution.setInput(input1, reinterpret_cast(_input1), 16); ++ execution.setInput(input2, reinterpret_cast(_input2), 16); ++ execution.setOutput(output1, reinterpret_cast(_output), 16); + +- execution->execute(); +- +- delete execution; ++ execution.execute(); + } + + private: +@@ -288,20 +276,18 @@ TEST(ExecInstance, async) + float output_buffer[4] = {}; + const float output_expected[4] = {5, -2, 0, -1}; + +- auto execution = new onert::exec::Execution(executors); ++ onert::exec::Execution 
execution{executors}; + +- execution->setInput(input1, reinterpret_cast(input1_buffer), 16); +- execution->setInput(input2, reinterpret_cast(input2_buffer), 16); +- execution->setOutput(output, reinterpret_cast(output_buffer), 16); +- execution->startExecute(); +- execution->waitFinish(); ++ execution.setInput(input1, reinterpret_cast(input1_buffer), 16); ++ execution.setInput(input2, reinterpret_cast(input2_buffer), 16); ++ execution.setOutput(output, reinterpret_cast(output_buffer), 16); ++ execution.startExecute(); ++ execution.waitFinish(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +- +- delete execution; + } + + } // namespace +diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl +index e50b941..005f61c 100644 +--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl ++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl +@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed ++GeneratedTests.cast_float32_to_int32_nnfw + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 + GeneratedTests.gather_float16_8 + GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_dynamic_float_nnfw ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw + GeneratedTests.log_4D_float_nnfw +@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +index c9edee5..d987bf1 100644 +--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon ++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed +-GeneratedTests.cast_float32_to_quant8_overflow +-GeneratedTests.cast_float32_to_quant8_overflow_relaxed + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ 
-73,6 +70,7 @@ GeneratedTests.gather_float16_8 + GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_boolean + GeneratedTests.greater_equal_dynamic_float_nnfw ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_boolean + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw +@@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +index 3cce4f3..bc0ae0f 100644 +--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu ++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 + GeneratedTests.hashtable_lookup_float + GeneratedTests.hashtable_lookup_float_4D_nnfw + GeneratedTests.hashtable_lookup_quant8 +-GeneratedTests.l2_normalization +-GeneratedTests.l2_normalization_2 +-GeneratedTests.l2_normalization_large + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 + GeneratedTests.neg + GeneratedTests.neg_3D_int_nnfw + GeneratedTests.neg_4D_int_nnfw +-GeneratedTests.pad_quant8_nnfw + GeneratedTests.prelu + GeneratedTests.prelu_broadcast_float_1_nnfw + GeneratedTests.prelu_broadcast_quant8_1_nnfw +@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 + GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.reduce_max_quant8 + GeneratedTests.reduce_max_quant8_1_nnfw + GeneratedTests.reduce_max_quant8_2 +@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 + GeneratedTests.select_v1_2_two_dim_quant8 + GeneratedTests.slice_5 + GeneratedTests.slice_6 +-GeneratedTests.slice_7 + GeneratedTests.slice_8 + GeneratedTests.slice_zero_sized + GeneratedTests.slice_zero_sized_quant8 +-GeneratedTests.space_to_depth_float_1 +-GeneratedTests.space_to_depth_float_2 +-GeneratedTests.space_to_depth_float_3 + GeneratedTests.space_to_depth_quant8_1 + GeneratedTests.space_to_depth_quant8_2 + GeneratedTests.sqrt_ +diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl +index e50b941..005f61c 100644 +--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl ++++ 
b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl +@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed ++GeneratedTests.cast_float32_to_int32_nnfw + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 + GeneratedTests.gather_float16_8 + GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_dynamic_float_nnfw ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw + GeneratedTests.log_4D_float_nnfw +@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +index 55cfe39..051fbc7 100644 +--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon ++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed +-GeneratedTests.cast_float32_to_quant8_overflow +-GeneratedTests.cast_float32_to_quant8_overflow_relaxed + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_boolean + GeneratedTests.greater_equal_dynamic_float_nnfw + GeneratedTests.less_boolean ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw + GeneratedTests.log_4D_float_nnfw +@@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + 
GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +index 3cce4f3..bc0ae0f 100644 +--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu ++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 + GeneratedTests.hashtable_lookup_float + GeneratedTests.hashtable_lookup_float_4D_nnfw + GeneratedTests.hashtable_lookup_quant8 +-GeneratedTests.l2_normalization +-GeneratedTests.l2_normalization_2 +-GeneratedTests.l2_normalization_large + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 + GeneratedTests.neg + GeneratedTests.neg_3D_int_nnfw + GeneratedTests.neg_4D_int_nnfw +-GeneratedTests.pad_quant8_nnfw + GeneratedTests.prelu + GeneratedTests.prelu_broadcast_float_1_nnfw + GeneratedTests.prelu_broadcast_quant8_1_nnfw +@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 + GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.reduce_max_quant8 + GeneratedTests.reduce_max_quant8_1_nnfw + GeneratedTests.reduce_max_quant8_2 +@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 + GeneratedTests.select_v1_2_two_dim_quant8 + GeneratedTests.slice_5 + GeneratedTests.slice_6 +-GeneratedTests.slice_7 + GeneratedTests.slice_8 + GeneratedTests.slice_zero_sized + GeneratedTests.slice_zero_sized_quant8 +-GeneratedTests.space_to_depth_float_1 +-GeneratedTests.space_to_depth_float_2 +-GeneratedTests.space_to_depth_float_3 + GeneratedTests.space_to_depth_quant8_1 + GeneratedTests.space_to_depth_quant8_2 + GeneratedTests.sqrt_ +diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp +index 08118ca..069d367 100644 +--- a/tests/nnapi/nnapi_gtest.skip.noarch.interp ++++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp +@@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8 + GeneratedTests.l2_normalization + GeneratedTests.l2_normalization_2 + GeneratedTests.l2_normalization_large ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2 + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw + GeneratedTests.pad_quant8_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 +@@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8 + 
GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +index 3cce4f3..bc0ae0f 100644 +--- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu ++++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 + GeneratedTests.hashtable_lookup_float + GeneratedTests.hashtable_lookup_float_4D_nnfw + GeneratedTests.hashtable_lookup_quant8 +-GeneratedTests.l2_normalization +-GeneratedTests.l2_normalization_2 +-GeneratedTests.l2_normalization_large + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 + GeneratedTests.neg + GeneratedTests.neg_3D_int_nnfw + GeneratedTests.neg_4D_int_nnfw +-GeneratedTests.pad_quant8_nnfw + GeneratedTests.prelu + GeneratedTests.prelu_broadcast_float_1_nnfw + GeneratedTests.prelu_broadcast_quant8_1_nnfw +@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 + GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.reduce_max_quant8 + GeneratedTests.reduce_max_quant8_1_nnfw + GeneratedTests.reduce_max_quant8_2 +@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 + GeneratedTests.select_v1_2_two_dim_quant8 + GeneratedTests.slice_5 + GeneratedTests.slice_6 +-GeneratedTests.slice_7 + GeneratedTests.slice_8 + GeneratedTests.slice_zero_sized + GeneratedTests.slice_zero_sized_quant8 +-GeneratedTests.space_to_depth_float_1 +-GeneratedTests.space_to_depth_float_2 +-GeneratedTests.space_to_depth_float_3 + GeneratedTests.space_to_depth_quant8_1 + GeneratedTests.space_to_depth_quant8_2 + GeneratedTests.sqrt_ +diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py +new file mode 100644 +index 0000000..ca3770c +--- /dev/null ++++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py +@@ -0,0 +1,30 @@ ++# ++# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++# Copyright (C) 2017 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++# ++ ++model = Model() ++in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") ++out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") ++model = model.Operation("L2_NORMALIZATION", in0).To(out0) ++ ++# Example 1. Input in operand 0, ++input0 = {in0: # input 0 ++ [0, 5, 12]} ++output0 = {out0: # output 0 ++ [51, 54, 58]} ++ ++# Instantiate an example ++Example((input0, output0)) +diff --git a/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py +new file mode 100644 +index 0000000..c500741 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py +@@ -0,0 +1,35 @@ ++# ++# Copyright (C) 2018 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++input0 = Input("input0", "TENSOR_FLOAT32", "{1, 2, 3, 1}") ++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, ++ 0, 2, ++ 1, 3, ++ 0, 0]) ++pad_value = Float32Scalar("pad_value", 9.3) ++output0 = Output("output0", "TENSOR_FLOAT32", "{1, 4, 7, 1}") ++ ++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) ++ ++Example(({ ++ input0: [1.0, 2.0, 3.0, ++ 4.0, 5.0, 6.0], ++}, { ++ output0: [9.3, 1.0, 2.0, 3.0, 9.3, 9.3, 9.3, ++ 9.3, 4.0, 5.0, 6.0, 9.3, 9.3, 9.3, ++ 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, ++ 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3], ++})).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py +new file mode 100644 +index 0000000..3dfaff6 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py +@@ -0,0 +1,35 @@ ++# ++# Copyright (C) 2018 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++# ++ ++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 2, 3, 1}, 2.3, 4") ++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, ++ 0, 2, ++ 1, 3, ++ 0, 0]) ++pad_value = Int32Scalar("pad_value", 9) ++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{1, 4, 7, 1}, 2.3, 4") ++ ++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) ++ ++Example(({ ++ input0: [1, 2, 3, ++ 4, 5, 6], ++}, { ++ output0: [9, 1, 2, 3, 9, 9, 9, ++ 9, 4, 5, 6, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9], ++})) +diff --git a/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py +new file mode 100644 +index 0000000..5b27f49 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py +@@ -0,0 +1,40 @@ ++# ++# Copyright (C) 2019 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++import numpy as np ++ ++input0 = Input("input0", "TENSOR_FLOAT32", "{1, 1, 2, 3}") ++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2, ++ 3, 4, ++ 3, 3, ++ 2, 1]) ++pad_value = Float32Scalar("pad_value", 3.9) ++output0 = Output("output0", "TENSOR_FLOAT32", "{4, 8, 8, 6}") ++ ++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) ++ ++Example({ ++ input0: [1.0, 2.0, 3.0, ++ 4.0, 5.0, 6.0], ++ output0: np.pad([[[[1.0, 2.0, 3.0], ++ [4.0, 5.0, 6.0]]]], ++ [[1, 2], ++ [3, 4], ++ [3, 3], ++ [2, 1]], ++ "constant", ++ constant_values=3.9).flatten().tolist(), ++}).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py +new file mode 100644 +index 0000000..5ee4b06 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py +@@ -0,0 +1,40 @@ ++# ++# Copyright (C) 2019 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++#
++
++import numpy as np
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 1, 2, 3}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2,
++                                                            3, 4,
++                                                            3, 3,
++                                                            2, 1])
++pad_value = Int32Scalar("pad_value", 3)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{4, 8, 8, 6}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++    input0: [1, 2, 3,
++             4, 5, 6],
++    output0: np.pad([[[[1, 2, 3],
++                       [4, 5, 6]]]],
++                    [[1, 2],
++                     [3, 4],
++                     [3, 3],
++                     [2, 1]],
++                    "constant",
++                    constant_values=3).flatten().tolist(),
++})
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
+new file mode 100644
+index 0000000..391d5cf
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
+@@ -0,0 +1,27 @@
++#
++# Copyright (C) 2019 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{3}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1])
++pad_value = Int32Scalar("pad_value", 9)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{7}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++    input0: [1, 2, 3],
++    output0: [9, 9, 9, 1, 2, 3, 9],
++})
+diff --git a/tests/nnapi/specs/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py
+new file mode 100644
+index 0000000..a42624d
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/quantize.mod.py
+@@ -0,0 +1,69 @@
++#
++# Copyright (C) 2018 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++import numpy as np
++
++num_values = 300
++values = list(np.linspace(-10, 10, num_values))
++
++for input_type in ["TENSOR_FLOAT32", "TENSOR_FLOAT16"]:
++    for scale, offset in [(1.0, 0),
++                          (1.0, 1),
++                          (0.01, 120),
++                          (10.0, 120)]:
++        input0 = Input("input0", input_type, "{%d}" % num_values)
++        output0 = Output("output0", input_type, "{%d}" % num_values)
++
++        model = Model().Operation("QUANTIZE", input0).To(output0)
++
++        quantizeOutput = DataTypeConverter().Identify({
++            output0: ["TENSOR_QUANT8_ASYMM", scale, offset],
++        })
++
++        Example({
++            input0: values,
++            output0: values,
++        }).AddVariations(quantizeOutput, includeDefault=False)
++
++
++# Zero-sized input
++
++# Use BOX_WITH_NMS_LIMIT op to generate a zero-sized internal tensor for box coordinates.
++p1 = Parameter("scores", "TENSOR_FLOAT32", "{1, 2}", [0.90, 0.10]) # scores
++p2 = Parameter("roi", "TENSOR_FLOAT32", "{1, 8}", [1, 1, 10, 10, 0, 0, 10, 10]) # roi
++o1 = Output("scoresOut", "TENSOR_FLOAT32", "{0}") # scores out
++o2 = Output("classesOut", "TENSOR_INT32", "{0}") # classes out
++tmp1 = Internal("roiOut", "TENSOR_FLOAT32", "{0, 4}") # roi out
++tmp2 = Internal("batchSplitOut", "TENSOR_INT32", "{0}") # batch split out
++model = Model("zero_sized").Operation("BOX_WITH_NMS_LIMIT", p1, p2, [0], 0.3, -1, 0, 0.4, 1.0, 0.3).To(o1, tmp1, o2, tmp2)
++
++# Use ROI_ALIGN op to convert into zero-sized feature map.
++layout = BoolScalar("layout", False) # NHWC
++i1 = Input("in", "TENSOR_FLOAT32", "{1, 1, 1, 1}")
++zero_sized = Internal("featureMap", "TENSOR_FLOAT32", "{0, 2, 2, 1}")
++model = model.Operation("ROI_ALIGN", i1, tmp1, tmp2, 2, 2, 2.0, 2.0, 4, 4, layout).To(zero_sized)
++
++# QUANTIZE op with numBatches = 0.
++o3 = Output("out", "TENSOR_QUANT8_ASYMM", "{0, 2, 2, 1}, 0.1f, 128") # out
++model = model.Operation("QUANTIZE", zero_sized).To(o3)
++
++# Create test case with dummy values.
++Example({ ++ i1: [1], ++ o1: [0], ++ o2: [0], ++ o3: [0], ++}).AddVariations("relaxed", "float16") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py +deleted file mode 100644 +index c500741..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py ++++ /dev/null +@@ -1,35 +0,0 @@ +-# +-# Copyright (C) 2018 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-input0 = Input("input0", "TENSOR_FLOAT32", "{1, 2, 3, 1}") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, +- 0, 2, +- 1, 3, +- 0, 0]) +-pad_value = Float32Scalar("pad_value", 9.3) +-output0 = Output("output0", "TENSOR_FLOAT32", "{1, 4, 7, 1}") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example(({ +- input0: [1.0, 2.0, 3.0, +- 4.0, 5.0, 6.0], +-}, { +- output0: [9.3, 1.0, 2.0, 3.0, 9.3, 9.3, 9.3, +- 9.3, 4.0, 5.0, 6.0, 9.3, 9.3, 9.3, +- 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, +- 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3], +-})).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py +deleted file mode 100644 +index 3dfaff6..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py ++++ /dev/null +@@ -1,35 +0,0 @@ +-# +-# Copyright (C) 2018 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 2, 3, 1}, 2.3, 4") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, +- 0, 2, +- 1, 3, +- 0, 0]) +-pad_value = Int32Scalar("pad_value", 9) +-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{1, 4, 7, 1}, 2.3, 4") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example(({ +- input0: [1, 2, 3, +- 4, 5, 6], +-}, { +- output0: [9, 1, 2, 3, 9, 9, 9, +- 9, 4, 5, 6, 9, 9, 9, +- 9, 9, 9, 9, 9, 9, 9, +- 9, 9, 9, 9, 9, 9, 9], +-})) +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py +deleted file mode 100644 +index 5b27f49..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py ++++ /dev/null +@@ -1,40 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. 
+-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-import numpy as np +- +-input0 = Input("input0", "TENSOR_FLOAT32", "{1, 1, 2, 3}") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2, +- 3, 4, +- 3, 3, +- 2, 1]) +-pad_value = Float32Scalar("pad_value", 3.9) +-output0 = Output("output0", "TENSOR_FLOAT32", "{4, 8, 8, 6}") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1.0, 2.0, 3.0, +- 4.0, 5.0, 6.0], +- output0: np.pad([[[[1.0, 2.0, 3.0], +- [4.0, 5.0, 6.0]]]], +- [[1, 2], +- [3, 4], +- [3, 3], +- [2, 1]], +- "constant", +- constant_values=3.9).flatten().tolist(), +-}).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py +deleted file mode 100644 +index 5ee4b06..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py ++++ /dev/null +@@ -1,40 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-import numpy +- +-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 1, 2, 3}, 2.3, 4") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2, +- 3, 4, +- 3, 3, +- 2, 1]) +-pad_value = Int32Scalar("pad_value", 3) +-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{4, 8, 8, 6}, 2.3, 4") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1, 2, 3, +- 4, 5, 6], +- output0: np.pad([[[[1, 2, 3], +- [4, 5, 6]]]], +- [[1, 2], +- [3, 4], +- [3, 3], +- [2, 1]], +- "constant", +- constant_values=3).flatten().tolist(), +-}) +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py +deleted file mode 100644 +index 391d5cf..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py ++++ /dev/null +@@ -1,27 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. 
+-# +- +-input0 = Input("input0", "TENSOR_FLOAT32", "{3}") +-paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1]) +-pad_value = Float32Scalar("pad_value", 9.9) +-output0 = Output("output0", "TENSOR_FLOAT32", "{7}") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1.0, 2.0, 3.0], +- output0: [9.9, 9.9, 9.9, 1.0, 2.0, 3.0, 9.9], +-}).AddVariations("float16") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py +deleted file mode 100644 +index b67c2b8..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py ++++ /dev/null +@@ -1,27 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{3}, 2.3, 4") +-paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1]) +-pad_value = Int32Scalar("pad_value", 9) +-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{7}, 2.3, 4") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1, 2, 3], +- output0: [9, 9, 9, 1, 2, 3, 9], +-}) +diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/skip/V1_2/quantize.mod.py +deleted file mode 100644 +index a42624d..0000000 +--- a/tests/nnapi/specs/skip/V1_2/quantize.mod.py ++++ /dev/null +@@ -1,69 +0,0 @@ +-# +-# Copyright (C) 2018 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-import numpy as np +- +-num_values = 300 +-values = list(np.linspace(-10, 10, num_values)) +- +-for input_type in ["TENSOR_FLOAT32", "TENSOR_FLOAT16"]: +- for scale, offset in [(1.0, 0), +- (1.0, 1), +- (0.01, 120), +- (10.0, 120)]: +- input0 = Input("input0", input_type, "{%d}" % num_values) +- output0 = Output("output0", input_type, "{%d}" % num_values) +- +- model = Model().Operation("QUANTIZE", input0).To(output0) +- +- quantizeOutput = DataTypeConverter().Identify({ +- output0: ["TENSOR_QUANT8_ASYMM", scale, offset], +- }) +- +- Example({ +- input0: values, +- output0: values, +- }).AddVariations(quantizeOutput, includeDefault=False) +- +- +-# Zero-sized input +- +-# Use BOX_WITH_NMS_LIMIT op to generate a zero-sized internal tensor for box cooridnates. 
+-p1 = Parameter("scores", "TENSOR_FLOAT32", "{1, 2}", [0.90, 0.10]) # scores +-p2 = Parameter("roi", "TENSOR_FLOAT32", "{1, 8}", [1, 1, 10, 10, 0, 0, 10, 10]) # roi +-o1 = Output("scoresOut", "TENSOR_FLOAT32", "{0}") # scores out +-o2 = Output("classesOut", "TENSOR_INT32", "{0}") # classes out +-tmp1 = Internal("roiOut", "TENSOR_FLOAT32", "{0, 4}") # roi out +-tmp2 = Internal("batchSplitOut", "TENSOR_INT32", "{0}") # batch split out +-model = Model("zero_sized").Operation("BOX_WITH_NMS_LIMIT", p1, p2, [0], 0.3, -1, 0, 0.4, 1.0, 0.3).To(o1, tmp1, o2, tmp2) +- +-# Use ROI_ALIGN op to convert into zero-sized feature map. +-layout = BoolScalar("layout", False) # NHWC +-i1 = Input("in", "TENSOR_FLOAT32", "{1, 1, 1, 1}") +-zero_sized = Internal("featureMap", "TENSOR_FLOAT32", "{0, 2, 2, 1}") +-model = model.Operation("ROI_ALIGN", i1, tmp1, tmp2, 2, 2, 2.0, 2.0, 4, 4, layout).To(zero_sized) +- +-# QUANTIZE op with numBatches = 0. +-o3 = Output("out", "TENSOR_QUANT8_ASYMM", "{0, 2, 2, 1}, 0.1f, 128") # out +-model = model.Operation("QUANTIZE", zero_sized).To(o3) +- +-# Create test case with dummy values. +-Example({ +- i1: [1], +- o1: [0], +- o2: [0], +- o3: [0], +-}).AddVariations("relaxed", "float16") +diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +index 67f2467..c6c6355 100644 +--- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc ++++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +@@ -51,19 +51,24 @@ TEST_F(ValidationTestAddModelLoaded, output_tensorinfo) + ASSERT_EQ(tensor_info.dims[0], 1); + } + +-TEST_F(ValidationTestAddModelLoaded, neg_run_001) ++TEST_F(ValidationTestAddModelLoaded, neg_run) + { +- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); ++ // nnfw_prepare is not called ++ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); + } + +-TEST_F(ValidationTestAddModelLoaded, neg_set_input_001) ++TEST_F(ValidationTestAddModelLoaded, neg_set_input) + { +- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ // nnfw_prepare is not called ++ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + +-TEST_F(ValidationTestAddModelLoaded, neg_set_output_001) ++TEST_F(ValidationTestAddModelLoaded, neg_set_output) + { +- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ // nnfw_prepare is not called ++ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestAddModelLoaded, neg_get_input_size) +@@ -81,7 +86,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model) + // load model twice + ASSERT_EQ(nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), +- NNFW_STATUS_ERROR); ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo) +diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +index 1bb4182..0f4a4af 100644 +--- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc ++++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +@@ -102,7 +102,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run) + { + SetInOutBuffers(); + ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR); +- EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); ++ EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); 
+ ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR); + } + +@@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model) + // Load model twice + ASSERT_EQ(nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), +- NNFW_STATUS_ERROR); ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestAddSessionPrepared, neg_prepare) + { + // Call Prepare twice +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + // TODO Validation check when "nnfw_run" is called without input & output tensor setting +diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc +index 2675aa7..01832db 100644 +--- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc ++++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc +@@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1) + nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()), + NNFW_STATUS_ERROR); +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) +@@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) + _session, + NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()), + NNFW_STATUS_ERROR); +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_prepare_001) + { + // nnfw_load_model_from_file was not called +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_run_001) + { + // nnfw_load_model_from_file and nnfw_prepare was not called +- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_set_input_001) + { +- // Invalid state +- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_set_output_001) + { +- // Invalid state +- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_get_input_size) + { + uint32_t size = 10000; +- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR); +- ASSERT_EQ(size, 10000); ++ ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE); ++ ASSERT_EQ(size, 10000); // Remain unchanged + } + + TEST_F(ValidationTestSessionCreated, neg_get_output_size) + { + uint32_t size = 10000; +- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR); +- ASSERT_EQ(size, 10000); ++ ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE); ++ ASSERT_EQ(size, 10000); // Remain unchanged + } + + TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo) + { + nnfw_tensorinfo tensor_info; + // model is not loaded +- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), 
NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE); + // model is not loaded and tensor_info is null +- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE); + } +diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh +index c7f44c5..af79728 100755 +--- a/tests/scripts/benchmark_nnapi.sh ++++ b/tests/scripts/benchmark_nnapi.sh +@@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + + source $MY_PATH/common.sh + +-BENCHMARK_RUN_TEST_SH= + BENCHMARK_DRIVER_BIN= + BENCHMARK_REPORT_DIR= + BENCHMARK_MODELS_FILE= +@@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument + + function Usage() + { +- echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run" ++ echo "Usage: ./$0 --reportdir=. --driverbin=Product/out/bin/tflite_run" + } + + for i in "$@" +@@ -43,9 +42,6 @@ do + --test_op) + TEST_OP="true" + ;; +- --runtestsh=*) +- BENCHMARK_RUN_TEST_SH=${i#*=} +- ;; + --driverbin=*) + BENCHMARK_DRIVER_BIN=${i#*=} + ;; +@@ -147,9 +143,8 @@ function run_onert_with_all_config() + local REPORT_MODEL_DIR=$2 + local PAUSE_TIME_IN_SEC=$3 + local BENCHMARK_DRIVER_BIN=$4 +- local BENCHMARK_RUN_TEST_SH=$5 +- local EXECUTORS=$6 +- local BACKEND_LIST=$7 ++ local EXECUTORS=$5 ++ local BACKEND_LIST=$6 + + export USE_NNAPI=1 + +@@ -163,18 +158,18 @@ function run_onert_with_all_config() + done + export BACKENDS=$BACKENDS_TO_USE + if [ "$TEST_OP" == "false" ]; then +- profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT ++ profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT + fi + + for executor in $EXECUTORS; do + export EXECUTOR=$executor + if [ "$TEST_OP" == "false" ]; then +- run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor ++ run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor + fi + for backend in $BACKEND_LIST; do + export OP_BACKEND_ALLOPS=$backend + run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\ +- $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH ++ $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN + done + done + unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS +@@ -215,14 +210,14 @@ function run_benchmark_test() + + # TFLite+CPU + unset USE_NNAPI +- run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH ++ run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN + + # run onert + if [ "$TEST_OP" == "true" ]; then + # Operation test don't need to test each scheduler +- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST" ++ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST" + else +- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST" ++ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST" + fi + + if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then +diff --git a/tests/scripts/common.sh 
b/tests/scripts/common.sh
+index 8800290..b2799c2 100755
+--- a/tests/scripts/common.sh
++++ b/tests/scripts/common.sh
+@@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+ function get_result_of_benchmark_test()
+ {
+-    local RUN_TEST_SH=$1
+-    local DRIVER_BIN=$2
+-    local MODEL=$3
+-    local LOG_FILE=$4
++    local DRIVER_BIN=$1
++    local MODEL=$2
++    local LOG_FILE=$3
+
+     local RET=0
+-    $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
++    $MY_PATH/framework/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
+     RET=$?
+     if [[ $RET -ne 0 ]]; then
+         echo "Testing $MODEL aborted... exit code: $RET"
+@@ -68,7 +67,7 @@ function run_benchmark_and_print()
+     LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt
+     RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result
+     print_with_dots $MSG
+-    RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE)
++    RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE)
+     echo "$RESULT ms"
+     print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE
+     sleep $PAUSE_TIME_IN_SEC
+diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/framework/run_test.sh
+index 44b7149..9440c52 100755
+--- a/tests/scripts/framework/run_test.sh
++++ b/tests/scripts/framework/run_test.sh
+@@ -28,10 +28,12 @@ function Usage()
+     echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}"
+     echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2"
+     echo ""
+-    echo "--download - (default=off) Download model files. Other options is ignored"
+-    echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
+-    echo "--reportdir - (default=report) directory to place tap files"
+-    echo "--tapname - (default=framework_test.tap) file name to be written for tap"
++    echo "--download - (default=on) Download model files"
++    echo "--run - (default=on) Test model files"
++    echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for running model tests"
++    echo "--reportdir - (default=report) Directory to place tap files"
++    echo "--tapname - (default=framework_test.tap) File name to be written for tap"
++    echo "--md5 - (default=on) MD5 check when downloading model files"
+     echo ""
+ }
+
+@@ -43,9 +45,13 @@ function need_download()
+         return 0;
+     fi
+     # Ignore checking md5 in cache
++    # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable
+     if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then
+         return 1
+     fi
++    if [ "$MD5_CHECK" = "off" ]; then
++        return 1
++    fi
+
+     LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }')
+     REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }')
+@@ -60,7 +66,9 @@ function need_download()
+ DRIVER_BIN=""
+ TAP_NAME="framework_test.tap"
+ TEST_LIST=()
+-DOWNLOAD_MODE="off"
++DOWNLOAD_MODEL="on"
++RUN_TEST="on"
++MD5_CHECK="on"
+
+ # Support environment variable setting for mirror server
+ FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}"
+@@ -84,6 +92,12 @@ do
+         --download=*)
+             DOWNLOAD_MODE=${i#*=}
+             ;;
++        --md5=*)
++            MD5_CHECK=${i#*=}
++            ;;
++        --run=*)
++            RUN_TEST=${i#*=}
++            ;;
+         *)
+             TEST_LIST+=( $i )
+             ;;
+@@ -100,7 +114,7 @@ if [ ! -n "$DRIVER_BIN" ]; then
+ fi
+
+ # Check test driver setting
+-if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then
++if [ !
-e $DRIVER_BIN ] && [ "$RUN_TEST" = "on" ]; then + echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN" + exit 1 + fi +@@ -139,33 +153,9 @@ run_tests() + + TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME + MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME +- MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME" +- if [ -n "$FIXED_MODELFILE_SERVER" ]; then +- MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME" +- fi +- +- # Download model file +- if [ ! -e $TEST_CACHE_PATH ]; then +- mkdir -p $TEST_CACHE_PATH +- fi +- +- # Download unless we have it in cache (Also check md5sum) +- if need_download "$MODELFILE" "$MODELFILE_URL"; then +- echo "" +- echo "Download test file for $TEST_NAME" +- echo "======================" +- +- rm -f $MODELFILE # Remove invalid file if exists +- pushd $TEST_CACHE_PATH +- wget -nv $MODELFILE_URL +- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then +- unzip -o $MODELFILE_NAME +- fi +- popd +- fi + + # Find model file for downloaded by zip +- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then ++ if [ "${MODELFILE_NAME##*.}" = "zip" ]; then + pushd $TEST_CACHE_PATH + MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite) + popd +@@ -178,7 +168,6 @@ run_tests() + # Run driver to test framework + $DRIVER_BIN $MODELFILE + +- #$DRIVER_BIN $MODELFILE + if [[ $? -eq 0 ]]; then + echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME + else +@@ -268,10 +257,11 @@ find_tests() + mkdir -p $REPORT_DIR + TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]}) + +-if [[ "$DOWNLOAD_MODE" == "on" ]]; then ++if [ "$DOWNLOAD_MODEL" = "on" ]; then + download_tests $TESTS_TO_RUN +- exit 0; + fi + +-run_tests $TESTS_TO_RUN ++if [ "$RUN_TEST" = "on" ]; then ++ run_tests $TESTS_TO_RUN ++fi + exit $? +diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh +index 615fc2c..a720b15 100755 +--- a/tests/scripts/test-driver.sh ++++ b/tests/scripts/test-driver.sh +@@ -38,7 +38,6 @@ function Usage() + echo "etc." + echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests" + echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests" +- echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification" + echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test" + echo "" + echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report" +@@ -49,7 +48,6 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )" + ARTIFACT_PATH="$TEST_DRIVER_DIR/../../" + FRAMEWORK_DRIVER_BIN="" + VERIFICATION_DRIVER_BIN="" +-RUN_TEST_SH="" + UNIT_TEST_DIR="" + ALLTEST_ON="true" + UNITTEST_ON="false" +@@ -74,9 +72,6 @@ do + --verification_driverbin=*) + VERIFICATION_DRIVER_BIN=${i#*=} + ;; +- --runtestsh=*) +- RUN_TEST_SH=${i#*=} +- ;; + --unittestdir=*) + UNIT_TEST_DIR=${i#*=} + ;; +@@ -116,15 +111,6 @@ done + + ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)" + +-if [ -z "$RUN_TEST_SH" ]; then +- RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh +-fi +- +-if [ ! 
-e "$RUN_TEST_SH" ]; then +- echo "Cannot find $RUN_TEST_SH" +- exit 1 +-fi +- + if [ -z "$UNIT_TEST_DIR" ]; then + UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest + fi +@@ -149,7 +135,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then + fi + + $TEST_DRIVER_DIR/test_framework.sh \ +- --runtestsh=$RUN_TEST_SH \ + --driverbin=$FRAMEWORK_DRIVER_BIN \ + --reportdir=$REPORT_DIR \ + --tapname=framework_test.tap \ +@@ -166,7 +151,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then + + # verification uses the same script as frameworktest does + $TEST_DRIVER_DIR/test_framework.sh \ +- --runtestsh=$RUN_TEST_SH \ + --driverbin=$VERIFICATION_DRIVER_BIN \ + --reportdir=$REPORT_DIR \ + --tapname=verification_test.tap \ +@@ -180,7 +164,6 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then + + $TEST_DRIVER_DIR/benchmark_nnapi.sh \ + --test_op \ +- --runtestsh=$RUN_TEST_SH \ + --driverbin=$DRIVER_BIN \ + --reportdir=$REPORT_DIR/benchmark_op \ + --modelfilepath=$ARTIFACT_PATH/tests/scripts/framework +diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh +index 1d97515..bd86cd3 100755 +--- a/tests/scripts/test_framework.sh ++++ b/tests/scripts/test_framework.sh +@@ -14,7 +14,8 @@ + # See the License for the specific language governing permissions and + # limitations under the License. + +-FWTEST_RUN_TEST_SH= ++MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ++ + FWTEST_DRIVER_BIN= + FWTEST_REPORT_DIR= + FWTEST_TAP_NAME= +@@ -25,7 +26,6 @@ function Usage() + { + echo "Usage Example:" + echo "./$0 \\" +- echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path" + echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path" + echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\" + echo " --reportdir=report \\ # Directory for the report files will be saved" +@@ -42,9 +42,6 @@ do + -h|--help|help) + Usage + ;; +- --runtestsh=*) +- FWTEST_RUN_TEST_SH=${i#*=} +- ;; + --driverbin=*) + FWTEST_DRIVER_BIN=${i#*=} + ;; +@@ -67,7 +64,6 @@ do + shift + done + +-[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage + [ ! -z "$FWTEST_DRIVER_BIN" ] || Usage + [ ! -z "$FWTEST_REPORT_DIR" ] || Usage + [ ! -z "$FWTEST_TAP_NAME" ] || Usage +@@ -86,7 +82,7 @@ if [ ! 
-z "$FRAMEWORKTEST_LIST_FILE" ]; then + MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}") + fi + +-$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \ ++$MY_PATH/framework/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \ + --reportdir=$FWTEST_REPORT_DIR \ + --tapname=$FWTEST_TAP_NAME \ + ${MODELLIST:-} \ +diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt +index 0e333a0..ec45db4 100644 +--- a/tests/tools/nnpackage_run/CMakeLists.txt ++++ b/tests/tools/nnpackage_run/CMakeLists.txt +@@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src) + target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS}) + + target_link_libraries(nnpackage_run onert_core onert tflite_loader) +-target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp) ++target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp) + target_link_libraries(nnpackage_run nnfw-dev) + target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY}) + target_link_libraries(nnpackage_run nnfw_lib_benchmark) +diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc +index 0dbcafc..cb4a7db 100644 +--- a/tests/tools/nnpackage_run/src/args.cc ++++ b/tests/tools/nnpackage_run/src/args.cc +@@ -16,6 +16,7 @@ + + #include "args.h" + ++#include + #include + #include + +@@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv) + + void Args::Initialize(void) + { ++ auto process_nnpackage = [&](const std::string &package_filename) { ++ _package_filename = package_filename; ++ ++ std::cerr << "Package Filename " << _package_filename << std::endl; ++ if (_package_filename.empty()) ++ { ++ // TODO Print usage instead of the below message ++ std::cerr << "Please specify nnpackage file. Run with `--help` for usage." 
++ << "\n"; ++ ++ exit(1); ++ } ++ else ++ { ++ if (access(_package_filename.c_str(), F_OK) == -1) ++ { ++ std::cerr << "nnpackage not found: " << _package_filename << "\n"; ++ } ++ } ++ }; ++ ++ auto process_output_sizes = [&](const std::string &output_sizes_json_str) { ++ Json::Value root; ++ Json::Reader reader; ++ if (!reader.parse(output_sizes_json_str, root, false)) ++ { ++ std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; ++ exit(1); ++ } ++ ++ auto arg_map = argArrayToMap(root); ++ for (auto &pair : arg_map) ++ { ++ uint32_t key = pair.first; ++ Json::Value &val_json = pair.second; ++ if (!val_json.isUInt()) ++ { ++ std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; ++ exit(1); ++ } ++ uint32_t val = val_json.asUInt(); ++ _output_sizes[key] = val; ++ } ++ }; ++ ++ auto process_shape_prepare = [&](const std::string &shape_str) { ++ try ++ { ++ handleShapeParam(_shape_prepare, shape_str); ++ } ++ catch (const std::exception &e) ++ { ++ std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; ++ exit(1); ++ } ++ }; ++ ++ auto process_shape_run = [&](const std::string &shape_str) { ++ try ++ { ++ handleShapeParam(_shape_run, shape_str); ++ } ++ catch (const std::exception &e) ++ { ++ std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; ++ exit(1); ++ } ++ }; ++ + // General options + po::options_description general("General options", 100); + +@@ -112,32 +182,33 @@ void Args::Initialize(void) + general.add_options() + ("help,h", "Print available options") + ("version", "Print version and exit immediately") +- ("nnpackage", po::value()->required()) ++ ("nnpackage", po::value()->required()->notifier(process_nnpackage)) + #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 +- ("dump,d", po::value()->default_value(""), "Output filename") +- ("load,l", po::value()->default_value(""), "Input filename") ++ ("dump,d", po::value()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename") ++ ("load,l", po::value()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename") + #endif +- ("output_sizes", po::value(), ++ ("output_sizes", po::value()->notifier(process_output_sizes), + "The output buffer size in JSON 1D array\n" + "If not given, the model's output sizes are used\n" + "e.g. 
'[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n") +- ("num_runs,r", po::value()->default_value(1), "The number of runs") +- ("warmup_runs,w", po::value()->default_value(0), "The number of warmup runs") +- ("run_delay,t", po::value()->default_value(-1), "Delay time(ms) between runs (as default no delay") +- ("gpumem_poll,g", po::value()->default_value(false), "Check gpu memory polling separately") +- ("mem_poll,m", po::value()->default_value(false), "Check memory polling") +- ("write_report,p", po::value()->default_value(false), ++ ("num_runs,r", po::value()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs") ++ ("warmup_runs,w", po::value()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs") ++ ("run_delay,t", po::value()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay") ++ ("gpumem_poll,g", po::value()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately") ++ ("mem_poll,m", po::value()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling") ++ ("write_report,p", po::value()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }), + "Write report\n" + "{exec}-{nnpkg}-{backend}.csv will be generated.\n" + "e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n" + "{nnpkg} name may be changed to realpath if you use symbolic-link.") +- ("shape_prepare", po::value()->default_value("[]"), ++ ("shape_prepare", po::value()->default_value("[]")->notifier(process_shape_prepare), + "set shape of specified tensor before compilation\n" + "e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n") +- ("shape_run", po::value()->default_value("[]"), ++ ("shape_run", po::value()->default_value("[]")->notifier(process_shape_run), + "set shape of specified tensor right before running\n" + "e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n") +- ("verbose_level,v", po::value()->default_value(0), "Verbose level\n" ++ ("verbose_level,v", po::value()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }), ++ "Verbose level\n" + "0: prints the only result. Messages btw run don't print\n" + "1: prints result and message btw run\n" + "2: prints all of messages to print\n") +@@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv) + return; + } + +- po::notify(vm); + try + { +-#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 +- if (vm.count("dump")) +- { +- _dump_filename = vm["dump"].as(); +- } +- +- if (vm.count("load")) +- { +- _load_filename = vm["load"].as(); +- } +-#endif +- +- if (vm.count("nnpackage")) +- { +- _package_filename = vm["nnpackage"].as(); +- +- if (_package_filename.empty()) +- { +- // TODO Print usage instead of the below message +- std::cerr << "Please specify nnpackage file. Run with `--help` for usage." 
+- << "\n"; +- +- exit(1); +- } +- else +- { +- if (access(_package_filename.c_str(), F_OK) == -1) +- { +- std::cerr << "nnpackage not found: " << _package_filename << "\n"; +- } +- } +- } +- +- if (vm.count("output_sizes")) +- { +- auto output_sizes_json_str = vm["output_sizes"].as(); +- +- Json::Value root; +- Json::Reader reader; +- if (!reader.parse(output_sizes_json_str, root, false)) +- { +- std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; +- exit(1); +- } +- +- auto arg_map = argArrayToMap(root); +- for (auto &pair : arg_map) +- { +- uint32_t key = pair.first; +- Json::Value &val_json = pair.second; +- if (!val_json.isUInt()) +- { +- std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; +- exit(1); +- } +- uint32_t val = val_json.asUInt(); +- _output_sizes[key] = val; +- } +- } +- +- if (vm.count("num_runs")) +- { +- _num_runs = vm["num_runs"].as(); +- } +- +- if (vm.count("warmup_runs")) +- { +- _warmup_runs = vm["warmup_runs"].as(); +- } +- +- if (vm.count("run_delay")) +- { +- _run_delay = vm["run_delay"].as(); +- } +- +- if (vm.count("gpumem_poll")) +- { +- _gpumem_poll = vm["gpumem_poll"].as(); +- } +- +- if (vm.count("mem_poll")) +- { +- _mem_poll = vm["mem_poll"].as(); +- // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP +- if (_mem_poll && _warmup_runs == 0) +- { +- _warmup_runs = 1; +- } +- } +- +- if (vm.count("write_report")) +- { +- _write_report = vm["write_report"].as(); +- } +- +- if (vm.count("verbose_level")) +- { +- _verbose_level = vm["verbose_level"].as(); +- } ++ po::notify(vm); + } + catch (const std::bad_cast &e) + { +- std::cerr << "error by bad cast" << e.what() << '\n'; ++ std::cerr << "Bad cast error - " << e.what() << '\n'; + exit(1); + } + +- if (vm.count("shape_prepare")) +- { +- std::string shape_str; +- try +- { +- shape_str = vm["shape_prepare"].as(); +- } +- catch (const std::bad_cast &e) +- { +- std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n'; +- exit(1); +- } +- try +- { +- handleShapeParam(_shape_prepare, shape_str); +- } +- catch (const std::exception &e) +- { +- std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; +- exit(1); +- } +- } +- +- if (vm.count("shape_run")) ++ // This must be run after `notify` as `_warm_up_runs` must have been processed before. 
++ if (vm.count("mem_poll")) + { +- std::string shape_str; +- try +- { +- shape_str = vm["shape_run"].as(); +- } +- catch (const std::bad_cast &e) ++ // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP ++ if (_mem_poll && _warmup_runs == 0) + { +- std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n'; +- exit(1); +- } +- try +- { +- handleShapeParam(_shape_run, shape_str); +- } +- catch (const std::exception &e) +- { +- std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; +- exit(1); ++ _warmup_runs = 1; + } + } + } +diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc +index 34c075c..09ace47 100644 +--- a/tests/tools/nnpackage_run/src/h5formatter.cc ++++ b/tests/tools/nnpackage_run/src/h5formatter.cc +@@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector= v20.05 %endif Requires(post): /sbin/ldconfig diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe new file mode 100644 index 0000000..7322e90 --- /dev/null +++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe @@ -0,0 +1,26 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 7 dim: 7 dim: 1 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} +operation { + type: "AveragePool2D" + averagepool2d_options { + padding: VALID + stride_w: 1 + stride_h: 1 + filter_width: 2 + filter_height: 2 + } + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe new file mode 100644 index 0000000..a09afc1 --- /dev/null +++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe @@ -0,0 +1,44 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 5 dim: 5 } +} +operand { + name: "ker" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 2 dim: 25 } +} +operand { + name: "bias" + type: FLOAT32 + shape { dim: 25 } + filler { + tag: "constant" + arg: "1.1" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 25 } +} +operation { + type: "DepthwiseConv2D" + version: 2 + depthwiseconv2d_options { + padding: VALID + stride_w: 2 + stride_h: 2 + dilation_w_factor: 2 + dilation_h_factor: 1 + depth_multiplier: 5 + activation : RELU6 + } + input: "ifm" + input: "ker" + input: "bias" + output: "ofm" +} +input: "ifm" +input: "ker" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule new file mode 100644 index 0000000..edfabc6 --- /dev/null +++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule @@ -0,0 +1,3 @@ +# To check if DEPTHWISE_CONV_2D version is 2 + +RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2 diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe new 
file mode 100644 index 0000000..5e0b6b5 --- /dev/null +++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe @@ -0,0 +1,61 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 112 dim: 112 dim: 4 } + quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } +} +operand { + name: "ker" + type: UINT8 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } + quant { + min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594 + max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97 + scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821 + zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87 + quantized_dimension: 3 + } +} +operand { + name: "bias" + type: INT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0" + arg: "1.0" + } + quant { + scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16 + zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0 + } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 112 dim: 112 dim: 4 } + quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } + +} +operation { + type: "DepthwiseConv2D" + depthwiseconv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + depth_multiplier: 1 + activation : RELU6 + } + input: "ifm" + input: "ker" + input: "bias" + output: "ofm" +} +input: "ifm" +input: "ker" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe new file mode 100644 index 0000000..3fff5cd --- /dev/null +++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe @@ -0,0 +1,22 @@ +operand { + name: "ifm1" + type: UINT8 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } + quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } + quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} +} +operation { + type: "L2Normalize" + l2norm_options { + activation: NONE + } + input: "ifm1" + output: "ofm" +} +input: "ifm1" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe new file mode 100644 index 0000000..7b2a84d --- /dev/null +++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe @@ -0,0 +1,19 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } +} +operation { + type: "Logistic" + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe index 79271a4..1313e26 100644 --- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe +++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe @@ -10,7 +10,7 @@ operand { operand 
{ name: "ker" type: FLOAT32 - shape { dim: 1 dim: 3 dim: 3 dim: 1 } + shape { dim: 3 dim: 1 dim: 1 dim: 3 } filler { tag: "gaussian" arg: "0.0" diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe new file mode 100644 index 0000000..887380c --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 4 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT32 + shape { dim: 4 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT32 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe new file mode 100644 index 0000000..9beb516 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 4 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT64 + shape { dim: 4 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT64 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe new file mode 100644 index 0000000..67b947f --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: INT32 + shape { dim: 5 } +} +operand { + name: "ofm" + type: INT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT32 + shape { dim: 5 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT32 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe new file mode 100644 index 0000000..375db66 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: INT32 + shape { dim: 5 } +} +operand { + name: "ofm" + type: INT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT64 + shape { dim: 5 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT64 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe new file mode 100644 index 0000000..d3985e4 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe @@ -0,0 +1,28 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 4 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} 
+operand { + name: "ofm" + type: UINT8 + shape { } +} +operand { + name: "ofm_idx" + type: INT32 + shape { dim: 4 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT32 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe new file mode 100644 index 0000000..b08dd85 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe @@ -0,0 +1,28 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 5 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} +operand { + name: "ofm" + type: UINT8 + shape { } +} +operand { + name: "ofm_idx" + type: INT64 + shape { dim: 5 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT64 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt index 2af0ffa..748b2d1 100644 --- a/runtime/libs/benchmark/CMakeLists.txt +++ b/runtime/libs/benchmark/CMakeLists.txt @@ -1,6 +1,5 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") -add_library(nnfw_lib_benchmark SHARED ${SOURCES}) +add_library(nnfw_lib_benchmark STATIC ${SOURCES}) target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD}) -install(TARGETS nnfw_lib_benchmark DESTINATION lib) diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp index 7a3f9a5..df573da 100644 --- a/runtime/libs/benchmark/src/Result.cpp +++ b/runtime/libs/benchmark/src/Result.cpp @@ -166,7 +166,7 @@ Result::Result(const Phases &phases) if (option.memory) { print_memory = true; - for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i) + for (int i = PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i) { auto phase = phases.at(gPhaseStrings[i]); for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j) diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h index 031aabd..03a3aed 100644 --- a/runtime/onert/api/include/nnfw.h +++ b/runtime/onert/api/include/nnfw.h @@ -99,6 +99,8 @@ typedef enum { NNFW_STATUS_ERROR = 1, /** Unexpected null argument is given. */ NNFW_STATUS_UNEXPECTED_NULL = 2, + /** When a function was called but it is not valid for the current session state. */ + NNFW_STATUS_INVALID_STATE = 3, } NNFW_STATUS; /** @@ -432,10 +434,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index, * *

Supported backends differs on each platforms. * For example, `x86_64` supports "cpu" only. - * Can set multiple backends by semicolon (ex: "acl_cl;cpu"). - * Among the multiple backends, the 1st element is used as default backend. - * - * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn" + * Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu"). + * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during + * {@link nnfw_prepare}. + * Among the multiple backends, the 1st element is used as the default backend.
* * @param[in] session session to which avilable backends are set * @param[in] backends available backends on which nnfw uses @@ -449,12 +451,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe * * This function should be called before {@link nnfw_prepare} is invoked. * - * Supported backends differs on each platforms. - * For example, `x86_64` supports "cpu" only. - * The backend for op has higher priority than available backends specified by - * nnfw_set_available_backends. + * The backend for op has higher priority than available backends specified by + * {@link nnfw_set_available_backends}.
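The nnfw.h hunks above redefine the backend-selection contract: backends are passed to nnfw_set_available_backends as a semicolon-separated list whose first entry becomes the default, the matching libbackend_{backend}.so libraries are loaded during nnfw_prepare, the per-operation nnfw_set_op_backend is marked deprecated since 1.8.0, and (together with the nnfw_api_internal.cc hunks below) wrong-state calls now return the new NNFW_STATUS_INVALID_STATE code instead of the generic NNFW_STATUS_ERROR. What follows is a minimal client-side sketch, not part of the patch, assuming the existing public C API entry points (nnfw_create_session, nnfw_load_model_from_file, nnfw_prepare, nnfw_close_session) and a hypothetical nnpackage path "./model/".

#include <nnfw.h>
#include <stdio.h>

int main(void)
{
  nnfw_session *session = NULL;

  if (nnfw_create_session(&session) != NNFW_STATUS_NO_ERROR)
    return 1;

  /* After this patch, calling this before a model is loaded returns
     NNFW_STATUS_INVALID_STATE rather than the generic NNFW_STATUS_ERROR. */
  if (nnfw_set_available_backends(session, "acl_cl;cpu") == NNFW_STATUS_INVALID_STATE)
    printf("backends can only be set after a model is loaded\n");

  /* "./model/" is a placeholder nnpackage directory. */
  if (nnfw_load_model_from_file(session, "./model/") != NNFW_STATUS_NO_ERROR)
    goto fail;

  /* Semicolon-separated list; "acl_cl" is the default backend and
     libbackend_acl_cl.so / libbackend_cpu.so are loaded during nnfw_prepare. */
  if (nnfw_set_available_backends(session, "acl_cl;cpu") != NNFW_STATUS_NO_ERROR)
    goto fail;

  if (nnfw_prepare(session) != NNFW_STATUS_NO_ERROR)
    goto fail;

  /* ... bind inputs/outputs and call nnfw_run(session) here ... */

  nnfw_close_session(session);
  return 0;

fail:
  nnfw_close_session(session);
  return 1;
}

Splitting NNFW_STATUS_INVALID_STATE out of NNFW_STATUS_ERROR presumably lets callers distinguish API-ordering mistakes from genuine runtime failures without parsing the stderr messages shown in the hunks below.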

* - * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon" + * @deprecated Deprecated since 1.8.0. * * @param[in] session session to be modified * @param[in] op operation to be set diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc index 0747583..34a46ed 100644 --- a/runtime/onert/api/src/nnfw_api.cc +++ b/runtime/onert/api/src/nnfw_api.cc @@ -31,6 +31,7 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5); STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0); STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1); STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2); +STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3); STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0); STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1); diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index d03ddd4..b3390fa 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -76,7 +76,7 @@ nnfw_session::~nnfw_session() = default; NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) { if (!isStateInitialized()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; if (!package_dir) { @@ -156,7 +156,7 @@ NNFW_STATUS nnfw_session::prepare() std::cerr << "invalid state"; } std::cerr << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase()) @@ -188,7 +188,7 @@ NNFW_STATUS nnfw_session::run() { std::cerr << "Error during nnfw_session::run : " << "run should be run after prepare" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } try @@ -211,7 +211,7 @@ NNFW_STATUS nnfw_session::run_async() { std::cerr << "Error during nnfw_session::run_async : " << "run_async should be run after prepare" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } _execution->startExecute(); @@ -241,7 +241,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo if (!isStatePreparedOrFinishedRun()) { std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (!buffer && length != 0) @@ -270,7 +270,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b if (!isStatePreparedOrFinishedRun()) { std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (!buffer && length != 0) @@ -296,7 +296,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b NNFW_STATUS nnfw_session::input_size(uint32_t *number) { if (isStateInitialized()) // Model is not loaded - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -318,7 +318,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number) NNFW_STATUS nnfw_session::output_size(uint32_t *number) { if (isStateInitialized()) // Model is not loaded - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -410,7 +410,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti) { std::cerr << "Error during set_input_tensorinfo : should be run after load_model" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK) @@ -463,6 +463,9 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const 
nnfw_tensor NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) { + if (isStateInitialized()) + return NNFW_STATUS_INVALID_STATE; + try { if (ti == nullptr) @@ -499,7 +502,7 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) { if (isStateInitialized()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; if (ti == nullptr) { @@ -570,7 +573,7 @@ static std::string get_op_backend_string(std::string op) NNFW_STATUS nnfw_session::set_available_backends(const char *backends) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -596,7 +599,7 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends) NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -627,7 +630,7 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) NNFW_STATUS nnfw_session::set_config(const char *key, const char *value) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; auto &options = _compiler->options(); @@ -693,7 +696,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph() NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; auto &options = _compiler->options(); diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index 3ca4058..4ab2d4c 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -31,6 +31,7 @@ #include "exec/FunctionSequence.h" #include "util/logging.h" #include "util/Utils.h" +#include "AclKernelGen.h" namespace onert { @@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); assert(_ctx.at(block_size_index).data()); auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 - ? 
arm_compute::SubDataType::BOOL - : arm_compute::SubDataType::NONE; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::CLCast>(); + std::unique_ptr<::arm_compute::IFunction> fn; + if (ifm_tensor->data_type() == ofm_tensor->data_type()) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); + + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + fn = std::move(l); + } + else + { + auto l = std::make_unique<::arm_compute::CLCast>(); + + // TODO Support converting float to int32 as round down + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } auto acl_fn = asAclClFunction(std::move(fn)); @@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), - conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), + ::arm_compute::Size2D(1U, 1U), act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, multiplier, act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, multiplier, act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ 
-217,19 +231,20 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride)}; + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::AvgPool2D &node) @@ -260,19 +275,21 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), + true /* exclude_padding */}; auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Concat &node) @@ -296,7 +313,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_alloc = _tensor_builder->at(ofm_index).get(); + auto output_tensor = _tensor_builder->at(ofm_index).get(); std::vector<::arm_compute::ICLTensor *> input_tensors; for (auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); @@ -305,7 +322,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) if (input_indexes.size() < 2) { auto l = std::make_unique<::arm_compute::CLCopy>(); - l->configure(input_tensors.at(0), output_alloc->handle()); + l->configure(input_tensors.at(0), output_tensor->handle()); fn = std::move(l); } else @@ -313,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto l = std::make_unique<::arm_compute::CLConcatenateLayer>(); const auto rank = _ctx.at(ofm_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); 
const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); - l->configure(input_tensors, output_alloc->handle(), fixed_axis); + l->configure(input_tensors, output_tensor->handle(), fixed_axis); fn = std::move(l); } @@ -327,75 +344,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node) void KernelGenerator::visit(const ir::operation::FullyConnected &node) { - using ir::operation::FullyConnected; - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - - const auto output_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); - const auto input_size = - _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - ir::Shape reshape(2); - if (input_rank == 3 || input_rank == 4) - { - const auto &ifm_shape = _ctx.at(input_index).shape(); - auto feature_size = 1; - for (int i = 0; i < ifm_shape.rank(); ++i) - { - feature_size *= ifm_shape.dim(i); - } - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - + auto output_tensor = _tensor_builder->at(output_index).get(); const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - const auto input_alloc = _tensor_builder->at(input_index).get(); - const auto weight_alloc = _tensor_builder->at(weight_index).get(); - const auto bias_alloc = _tensor_builder->at(bias_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = std::make_unique( - _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - - arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type = - arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL; - if (_ctx.at(weight_index).isConstant()) - { - kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; - assert(_ctx.at(weight_index).data()); - } - fn->configure( - input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), - needs_reshape, - ::onert::backend::acl_common::asTensorShape( - reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), - kernel_type); - + auto fn = acl_common::kernelGenFullyConnected( + node, _ctx, _tensor_builder, _current_op_seq_layout); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), - ActivationBuilder::generate(activation, output_alloc->handle())); + std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Mul &node) @@ -406,17 +363,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto 
ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Reduce &node) @@ -427,14 +385,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto keep_dims{node.param().keep_dims}; const auto reduce_type = node.param().reduce_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_alloc->layout(); + const auto backend_layout = input_tensor->layout(); std::unique_ptr fn; if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) @@ -443,7 +401,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto acl_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); - l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -453,7 +411,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout); - l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims, + l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type)); fn = std::move(l); @@ -469,13 +427,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); UNUSED_RELEASE(frontend_layout); @@ -483,7 +441,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -503,10 +461,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); _return_fn = std::move(acl_fn); } @@ -516,15 +474,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -538,13 +496,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + fn->configure(input_tensor->handle(), output_tensor->handle(), beta); auto acl_fn = asAclClFunction(std::move(fn)); @@ -558,10 +516,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ 
-613,7 +571,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto fn = std::make_unique<::arm_compute::CLSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); auto acl_fn = asAclClFunction(std::move(fn)); @@ -628,10 +586,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -704,7 +662,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<::arm_compute::CLStridedSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); auto acl_fn = asAclClFunction(std::move(fn)); @@ -720,10 +678,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto rank = _ctx.at(ifm_idx).shape().rank(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); std::vector pv(perm.cbegin(), perm.cend()); // Reversed @@ -732,7 +690,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) auto fn = std::make_unique<::arm_compute::CLPermute>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); auto acl_fn = asAclClFunction(std::move(fn)); @@ -747,17 +705,18 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const 
ir::operation::Sub &node) @@ -768,17 +727,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Div &node) @@ -789,16 +749,17 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Exp &node) @@ -806,12 +767,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLExpLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -823,12 +784,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -842,20 +803,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) 
const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto gamma_alloc = _tensor_builder->at(gamma_index).get(); - auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto gamma_tensor = _tensor_builder->at(gamma_index).get(); + auto beta_tensor = _tensor_builder->at(beta_index).get(); auto epsilon = node.param().epsilon; auto activation = node.param().activation; auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), - beta_alloc->handle(), epsilon); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), + beta_tensor->handle(), epsilon); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Logistic &node) @@ -863,15 +825,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -884,13 +846,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), ::arm_compute::BinaryLogicalOperation::AND); auto acl_fn = asAclClFunction(std::move(fn)); @@ -900,159 +862,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) void KernelGenerator::visit(const ir::operation::LSTM &node) { - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
- const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool 
has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - const auto activation = node.param().activation; - const auto cell_clip = cell_threshold; - const auto projection_clip = projection_threshold; - assert(cell_clip >= 0.f && projection_clip >= 0.f); - - auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); - auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); - auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); - auto output_alloc = _tensor_builder->at(output_index).get(); - - auto input_alloc = _tensor_builder->at(input_index).get(); - - auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - _tensor_builder->at(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - _tensor_builder->at(recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); - auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); - auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); - auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); - auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); - - auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); - - auto fn = std::make_unique<::arm_compute::CLLSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; - if (has_cifg_param) - { - auto input_to_input_weights_alloc = - _tensor_builder->at(input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (has_peephole_param) - { - auto cell_to_forget_weights_alloc = - _tensor_builder->at(cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - _tensor_builder->at(cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (has_projection_param) - { - auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional - auto projection_bias_handle = has_projection_bias - ? _tensor_builder->at(projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - fn->configure( - input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), - cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), - output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), - lstm_params, act_info, cell_clip, projection_clip); - - auto acl_fn = asAclClFunction(std::move(fn)); - - _return_fn = std::move(acl_fn); + _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); } void KernelGenerator::visit(const ir::operation::Comparison &node) @@ -1063,13 +874,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLComparison>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), (arm_compute::ComparisonOperation)comparison_type); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1107,13 +918,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) { size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape()); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != 
input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1135,8 +946,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -1149,7 +960,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::CLPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1160,7 +971,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::CLPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1168,7 +979,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) { auto l = std::make_unique<::arm_compute::CLCopy>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1183,12 +994,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); _return_fn = asAclClFunction(std::move(fn)); } @@ -1198,15 +1009,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1219,12 +1030,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto 
ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLScale>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); @@ -1238,15 +1049,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1258,15 +1069,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1288,25 +1099,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); - auto weights_alloc = _tensor_builder->at(weights_index).get(); - auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); - auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + auto weights_tensor = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = std::make_unique<::arm_compute::CLCopy>(); - copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); 
_return_fn = asAclClFunction(std::move(copy_layer)); - auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>( + auto fn = std::make_unique<::arm_compute::CLRNNLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), - bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), - act_info); + fn->configure(input_tensor->handle(), weights_tensor->handle(), + recurrent_weights_tensor->handle(), bias_tensor->handle(), + hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -1315,12 +1126,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLFloor>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1335,10 +1146,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); - auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + auto paddings_tensor = _tensor_builder->at(paddings_index).get(); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -1346,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) std::unique_ptr<::arm_compute::IFunction> fn; auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>(); - l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), - ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), + ofm_tensor->handle()); fn = std::move(l); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1362,12 +1173,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>(); + auto fn = std::make_unique<::arm_compute::CLSpaceToDepthLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1389,19 +1200,21 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); 
const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) @@ -1410,13 +1223,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>(); - fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1442,15 +1255,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
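The extra ifm_tensor->info()->data_layout() argument above presumably reflects the PoolingLayerInfo constructor that takes the data layout explicitly; that assumption matches the four-argument brace initialization used in the visitor. A standalone sketch of the same descriptor construction:

#include <arm_compute/core/Types.h>

// Builds the same pooling descriptor as the L2Pool2D visitor above; the
// DataLayout value comes straight from the input tensor's info.
::arm_compute::PoolingLayerInfo makeL2PoolInfo(unsigned kw, unsigned kh,
                                               ::arm_compute::DataLayout layout,
                                               const ::arm_compute::PadStrideInfo &pad_stride)
{
  return ::arm_compute::PoolingLayerInfo{::arm_compute::PoolingType::L2,
                                         ::arm_compute::Size2D{kw, kh}, layout, pad_stride};
}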
- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1466,17 +1279,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hits_alloc = _tensor_builder->at(hits_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hits_tensor = _tensor_builder->at(hits_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto keys_alloc = _tensor_builder->at(keys_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto keys_tensor = _tensor_builder->at(keys_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::CLHashtableLookup>(); - fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); + fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), + output_tensor->handle(), hits_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1489,13 +1302,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto alpha_tensor = _tensor_builder->at(alpha_index).get(); - auto fn = std::make_unique<::arm_compute::CLPReLU>(); + auto fn = std::make_unique<::arm_compute::CLPReluLayer>(); - fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1518,7 +1331,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) (node.param().padding.type == ir::PaddingType::VALID)); auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, ker_shape.W, ker_shape.H); - uint32_t invalid_horizontal = 0; uint32_t invalid_vertical = 0; if (node.param().padding.type == ir::PaddingType::VALID) @@ -1528,17 +1340,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = 
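As a plain CPU reference for what the CROSS_MAP trick above emulates (assuming ACL evaluates x / (bias + alpha * sum(x^2))^beta with scaling disabled, so bias = 0, alpha = 1, beta = 0.5 reduces to dividing by the L2 norm across channels):

#include <cmath>
#include <vector>

std::vector<float> l2_normalize(const std::vector<float> &channels)
{
  float sum_sq = 0.f;
  for (float v : channels)
    sum_sq += v * v;                              // the "reduction" in the comments above
  const float inv_norm = 1.f / std::sqrt(sum_sq); // pow(reduction, -0.5); no epsilon, mirroring bias = 0.0f
  std::vector<float> out;
  out.reserve(channels.size());
  for (float v : channels)
    out.push_back(v * inv_norm);
  return out;
}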
_tensor_builder->at(ker_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - invalid_horizontal, invalid_vertical); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), + tconv_info, invalid_horizontal, invalid_vertical); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1550,15 +1362,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1571,13 +1383,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLBitwiseOr>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1589,12 +1401,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLBitwiseNot>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1607,13 +1419,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = 
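The invalid_horizontal / invalid_vertical values above can be read as "how much wider or taller the IR's output is than what the ACL deconvolution naturally produces". A small worked check, assuming the natural output extent is (ifm - 1) * stride + ker:

#include <cassert>
#include <cstdint>

static uint32_t invalid_border(uint32_t ofm, uint32_t ifm, uint32_t stride, uint32_t ker)
{
  // Mirrors: ofm - (1 + (ifm - 1) * stride) - (ker - 1)
  return ofm - (1 + (ifm - 1) * stride) - (ker - 1);
}

int main()
{
  // ifm W = 4, stride 2, kernel W = 3 -> natural output width 9;
  // an IR output width of 10 therefore reports one invalid column.
  assert(invalid_border(10, 4, 2, 3) == 1);
  // When the IR output matches exactly, nothing is invalid.
  assert(invalid_border(9, 4, 2, 3) == 0);
  return 0;
}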
_tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1634,13 +1446,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) const auto k = node.param().k; - auto values_alloc = _tensor_builder->at(outputValues_index).get(); - auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); - auto input_alloc = _tensor_builder->at(inputData_index).get(); + auto values_tensor = _tensor_builder->at(outputValues_index).get(); + auto indices_tensor = _tensor_builder->at(outputIndices_index).get(); + auto input_tensor = _tensor_builder->at(inputData_index).get(); auto fn = std::make_unique<::arm_compute::CLTopKV2>(); - fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1659,9 +1471,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto indices_alloc = _tensor_builder->at(indices_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto indices_tensor = _tensor_builder->at(indices_index).get(); // NOTE The frontend layout and backend layout must be the same for this operation. // If not the same, we have to add a stage(?) to perform permutation of output tensor. It @@ -1671,43 +1483,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
- const auto backend_layout = ofm_alloc->layout(); + const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); - assert(backend_layout == ifm_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == ifm_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); auto fn = std::make_unique<::arm_compute::CLGatherEx>(); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; - assert(n == ifm_alloc->num_dimensions()); + assert(n == ifm_tensor->num_dimensions()); size_t k = _ctx.at(indices_index).shape().rank(); - assert(k == indices_alloc->num_dimensions()); + assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape(); - if (n != ifm_alloc->info()->num_dimensions()) + const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); + if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction const auto ifm = _ctx.at(ifm_index); - ifm_alloc->info()->set_tensor_shape( + ifm_tensor->info()->set_tensor_shape( acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); } - const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape(); - if (k != indices_alloc->info()->num_dimensions()) + const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); + if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction const auto indices = _ctx.at(indices_index); - indices_alloc->info()->set_tensor_shape( + indices_tensor->info()->set_tensor_shape( acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); } - fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // Revert disabling applied dim_correction - ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); - indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); + indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1719,12 +1531,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLNeg>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1736,15 +1548,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = 
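The disable/revert dance around CLGatherEx::configure above could also be written as a small scope guard; this is only a sketch of the pattern, not code from the patch:

#include <arm_compute/core/ITensorInfo.h>
#include <arm_compute/core/TensorShape.h>

class ShapeOverrideGuard
{
public:
  ShapeOverrideGuard(::arm_compute::ITensorInfo *info, const ::arm_compute::TensorShape &tmp_shape)
      : _info{info}, _orig_shape{info->tensor_shape()}
  {
    _info->set_tensor_shape(tmp_shape); // e.g. a shape built with dim_correction disabled
  }
  ~ShapeOverrideGuard() { _info->set_tensor_shape(_orig_shape); } // revert after configure()

private:
  ::arm_compute::ITensorInfo *_info;
  ::arm_compute::TensorShape _orig_shape;
};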
_tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1761,11 +1573,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) assert((ifm_shape.rank() - 1) == ofm_shape.rank()); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); auto frontend_layout = _current_op_seq_layout; - auto backend_layout = ifm_alloc->layout(); + auto backend_layout = ifm_tensor->layout(); int axis_value = node.param().axis; if (axis_value < 0) @@ -1776,10 +1588,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); - auto fn = std::make_unique<::arm_compute::CLArgOperation>(); + auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis}, - ::arm_compute::ArgOperation::MAX); + fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), + ::arm_compute::ReductionOperation::ARG_IDX_MAX); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1791,12 +1603,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::CLCast>(); + auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1814,15 +1626,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1837,12 +1649,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = 
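Both ArgMax visitors (and the ReductionOperation::ARG_IDX_MAX call above) first wrap a possibly negative axis into [0, rank); a tiny self-checking example of that step:

#include <cassert>

static int normalize_axis(int axis, int rank)
{
  if (axis < 0)
    axis += rank; // e.g. axis -1 on a rank-4 tensor means the last axis
  assert(axis >= 0 && axis < rank);
  return axis;
}

int main()
{
  assert(normalize_axis(-1, 4) == 3);
  assert(normalize_axis(2, 4) == 2);
  return 0;
}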
_tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::CLDepthToSpace>(); + auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1860,13 +1672,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - std::vector output_allocs; + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + std::vector output_tensors; for (const auto &ofm_ind : output_indexes) - output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); auto axis = node.param().axis; if (axis < 0) axis += ifm_rank; @@ -1874,7 +1686,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto fn = std::make_unique<::arm_compute::CLSplit>(); - fn->configure(ifm_alloc->handle(), output_allocs, axis); + fn->configure(ifm_tensor->handle(), output_tensors, axis); _return_fn = asAclClFunction(std::move(fn)); } @@ -1906,13 +1718,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) { size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_alloc = _tensor_builder->at(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); - assert(output_rank == output_alloc->num_dimensions()); - if (output_rank != output_alloc->info()->num_dimensions()) + const auto &output_tensor = _tensor_builder->at(output_index); + orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); + assert(output_rank == output_tensor->num_dimensions()); + if (output_rank != output_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1959,12 +1771,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node) // Disable applied dim_correction size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), frontend_layout, backend_layout, false)); } @@ -1982,13 +1794,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto 
lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseMin>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2001,13 +1813,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseMax>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2019,12 +1831,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2037,12 +1849,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0); auto acl_fn = asAclClFunction(std::move(fn)); diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h new file mode 100644 index 0000000..6253434 --- /dev/null +++ b/runtime/onert/backend/acl_common/AclKernelGen.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ +#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ + +#include +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace acl_common +{ + +template +std::unique_ptr +kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands, + const std::shared_ptr &tensor_builder) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. + const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + 
node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 && + operands.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 && + operands.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0); + + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE Although the projection weights has data the projection bias may not have data. 
+ bool has_projection_param = has_projection_weights; + + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); + + auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get(); + auto output_tensor = tensor_builder->at(output_index).get(); + + auto input_tensor = tensor_builder->at(input_index).get(); + + auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_tensor = + tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_tensor = + tensor_builder->at(recurrent_to_output_weights_index).get(); + + auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get(); + + auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto fn = std::make_unique(); + + ::arm_compute::LSTMParams lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_tensor = + tensor_builder->at(input_to_input_weights_index).get(); // optional + auto recurrent_to_input_weights_tensor = + tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), + recurrent_to_input_weights_tensor->handle(), + cell_to_input_weights_handle, input_gate_bias_tensor->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_tensor = + tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_tensor = + tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), + cell_to_output_weights_tensor->handle()); + } + if (has_projection_param) + { + auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? 
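The has_* flags above treat an optional LSTM input as present only when its operand has a non-empty shape; in shorthand (names mirror the code, the helpers are illustrative):

#include <cstdint>
#include <vector>

struct Shape
{
  std::vector<int32_t> dims;
  int32_t dim(size_t i) const { return dims[i]; }
};

// 2-D optional weights (e.g. input_to_input_weights, projection_weights)
static bool present2d(const Shape &s) { return s.dim(0) != 0 && s.dim(1) != 0; }
// 1-D optional weights/bias (e.g. cell_to_forget_weights, projection_bias)
static bool present1d(const Shape &s) { return s.dim(0) != 0; }

// has_cifg_param       : both non-CIFG weights present, i.e. true means *not* CIFG
// has_peephole_param   : cell_to_forget_weights and cell_to_output_weights present
// has_projection_param : projection_weights present; the projection bias may still be absent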
tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); + } + + fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(), + input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(), + recurrent_to_forget_weights_tensor->handle(), + recurrent_to_cell_weights_tensor->handle(), + recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(), + cell_bias_tensor->handle(), output_gate_bias_tensor->handle(), + output_state_in_tensor->handle(), cell_state_in_tensor->handle(), + scratch_buffer_tensor->handle(), output_state_out_tensor->handle(), + cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, + cell_clip, projection_clip); + + return std::make_unique(std::move(fn)); +} + +template +std::unique_ptr +kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands, + const std::shared_ptr &tensor_builder, ir::Layout layout) +{ + using ir::operation::FullyConnected; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + + const auto input_rank = operands.at(input_index).shape().rank(); + + const auto output_size = + operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1); + UNUSED_RELEASE(output_size); + assert(operands.at(bias_index).shape().dim(0) == output_size); + assert(operands.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = + operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2); + const auto input_size = + operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + ir::Shape reshape(2); + if (input_rank == 3 || input_rank == 4) + { + const auto &ifm_shape = operands.at(input_index).shape(); + auto feature_size = 1; + for (int i = 0; i < ifm_shape.rank(); ++i) + { + feature_size *= ifm_shape.dim(i); + } + + UNUSED_RELEASE(feature_size); + assert(feature_size == batch_size * input_size); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + auto output_tensor = tensor_builder->at(output_index).get(); + const auto input_tensor = tensor_builder->at(input_index).get(); + const auto weight_tensor = tensor_builder->at(weight_index).get(); + const auto bias_tensor = tensor_builder->at(bias_index).get(); + const auto frontend_layout = layout; + const auto acl_layout = output_tensor->handle()->info()->data_layout(); + + auto fn = + std::make_unique(tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL; + if (operands.at(weight_index).isConstant()) + { + kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS; + assert(operands.at(weight_index).data()); + } + + fn->configure( + input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(), + output_tensor->handle(), needs_reshape, + ::onert::backend::acl_common::asTensorShape( + reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), + kernel_type); + + return 
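The rank-2 reshape logic in kernelGenFullyConnected above flattens a rank-3/4 input to [batch_size, input_size] and checks that no elements are gained or lost; a quick numeric check of that invariant (the shapes are made up for illustration):

#include <cassert>

int main()
{
  const int ifm[4] = {2, 4, 4, 8};  // e.g. an NHWC input
  const int input_size = 4 * 4 * 8; // weight.dim(rank - 1) = 128
  const int batch_size = 2;         // output.dim(rank - 2)

  int feature_size = 1;
  for (int d : ifm)
    feature_size *= d; // 256 elements in total

  assert(feature_size == batch_size * input_size);
  // reshape = {batch_size /* H */, input_size /* W */} -> {2, 128}
  return 0;
}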
std::make_unique(std::move(fn)); +} + +} // namespace acl_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index e471867..37ec993 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -31,6 +31,7 @@ #include "exec/NopFunction.h" #include "util/logging.h" #include "util/Utils.h" +#include "AclKernelGen.h" namespace onert { @@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto frontend_layout = _current_op_seq_layout; - auto backend_layout = ifm_alloc->layout(); + auto backend_layout = ifm_tensor->layout(); int axis_value = node.param().axis; if (axis_value < 0) @@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>(); - fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), arm_compute::ReductionOperation::ARG_IDX_MAX); auto acl_fn = asAclFunction(std::move(fn)); @@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); assert(_ctx.at(block_size_index).data()); auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::NECast>(); + std::unique_ptr<::arm_compute::IFunction> fn; + if (ifm_tensor->data_type() == ofm_tensor->data_type()) + { + auto l = std::make_unique<::arm_compute::NECopy>(); + + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } + else + { + auto l = std::make_unique<::arm_compute::NECast>(); - auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 - ? arm_compute::SubDataType::BOOL - : arm_compute::SubDataType::NONE; - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } auto acl_fn = asAclFunction(std::move(fn)); @@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), - conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), + ::arm_compute::Size2D(1U, 1U), act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>(); + auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); auto acl_fn = asAclFunction(std::move(fn)); @@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto 
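The reworked acl_neon Cast above now splits into two cases; the sketch below shows the same dispatch in isolation (the wrapper function is hypothetical, the ACL calls mirror the diff):

#include <arm_compute/core/ITensor.h>
#include <arm_compute/runtime/IFunction.h>
#include <arm_compute/runtime/NEON/functions/NECast.h>
#include <arm_compute/runtime/NEON/functions/NECopy.h>
#include <memory>

std::unique_ptr<::arm_compute::IFunction> makeNeonCast(::arm_compute::ITensor *ifm,
                                                       ::arm_compute::ITensor *ofm)
{
  if (ifm->info()->data_type() == ofm->info()->data_type())
  {
    // Same element type: a conversion kernel is unnecessary, copy instead.
    auto copy = std::make_unique<::arm_compute::NECopy>();
    copy->configure(ifm, ofm);
    return copy;
  }
  // Different element types: saturating conversion, as in the visitor above.
  auto cast = std::make_unique<::arm_compute::NECast>();
  cast->configure(ifm, ofm, ::arm_compute::ConvertPolicy::SATURATE);
  return cast;
}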
ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, multiplier, act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, multiplier, act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -305,19 +318,19 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride)}; + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::AvgPool2D &node) @@ -348,19 +361,20 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), + true /* exclude_padding */}; auto fn = 
std::make_unique<::arm_compute::NEPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Concat &node) @@ -383,7 +397,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_alloc = _tensor_builder->at(ofm_index).get(); + auto output_tensor = _tensor_builder->at(ofm_index).get(); std::vector<::arm_compute::ITensor *> input_tensors; for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); @@ -392,7 +406,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) if (input_indexes.size() < 2) { auto l = std::make_unique<::arm_compute::NECopy>(); - l->configure(input_tensors.at(0), output_alloc->handle()); + l->configure(input_tensors.at(0), output_tensor->handle()); fn = std::move(l); } else @@ -400,10 +414,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto l = std::make_unique<::arm_compute::NEConcatenateLayer>(); const auto rank = _ctx.at(ofm_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); - l->configure(input_tensors, output_alloc->handle(), fixed_axis); + l->configure(input_tensors, output_tensor->handle(), fixed_axis); fn = std::move(l); } @@ -418,13 +432,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>(); - fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -436,12 +450,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NEFloor>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -450,76 +464,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node) void KernelGenerator::visit(const ir::operation::FullyConnected 
&node) { - using ir::operation::FullyConnected; - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - - const auto output_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); - const auto input_size = - _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - ir::Shape reshape(2); - if (input_rank == 3 || input_rank == 4) - { - const auto &ifm_shape = _ctx.at(input_index).shape(); - auto feature_size = 1; - for (int i = 0; i < ifm_shape.rank(); ++i) - { - feature_size *= ifm_shape.dim(i); - } - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - + auto output_tensor = _tensor_builder->at(output_index).get(); const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - const auto input_alloc = _tensor_builder->at(input_index).get(); - const auto weight_alloc = _tensor_builder->at(weight_index).get(); - const auto bias_alloc = _tensor_builder->at(bias_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = std::make_unique( - _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - - arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type = - arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL; - if (_ctx.at(weight_index).isConstant()) - { - kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; - assert(_ctx.at(weight_index).data()); - } - - fn->configure( - input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), - needs_reshape, - ::onert::backend::acl_common::asTensorShape( - reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), - kernel_type); - + auto fn = acl_common::kernelGenFullyConnected( + node, _ctx, _tensor_builder, _current_op_seq_layout); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), - ActivationBuilder::generate(activation, output_alloc->handle())); + std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } void KernelGenerator::visit(const ir::operation::HashtableLookup &node) @@ -531,17 +484,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hits_alloc = _tensor_builder->at(hits_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hits_tensor = _tensor_builder->at(hits_index).get(); - auto 
lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto keys_alloc = _tensor_builder->at(keys_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto keys_tensor = _tensor_builder->at(keys_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::NEHashtableLookup>(); - fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); + fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), + output_tensor->handle(), hits_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -561,10 +514,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // Converting in reverse order const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto indices_alloc = _tensor_builder->at(indices_index).get(); - const auto backend_layout = ofm_alloc->layout(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto indices_tensor = _tensor_builder->at(indices_index).get(); + const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); // NOTE The frontend layout and backend layout must be the same for this operation. @@ -575,35 +528,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
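// Illustrative sketch of the reverse-order axis conversion the NOTE above is
// about (an assumed standalone helper, not the actual acl_common::ToARMComputeAxis
// implementation): ACL numbers dimensions from the innermost one outwards, so a
// frontend axis on a rank-N tensor maps roughly to "rank - 1 - axis" once
// negative axes are normalized. The mapping is only meaningful when frontend
// and backend layouts agree, which is what the surrounding asserts guard.
#include <cassert>

int to_acl_axis(int rank, int axis) // hypothetical helper for illustration
{
  if (axis < 0)
    axis += rank; // e.g. axis -1 on a rank-4 tensor becomes 3
  assert(axis >= 0 && axis < rank);
  return rank - 1 - axis; // reverse-order mapping, e.g. rank 4, axis 1 -> 2
}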
- assert(backend_layout == ifm_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == ifm_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); auto fn = std::make_unique<::arm_compute::NEGatherEx>(); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; - assert(n == ifm_alloc->num_dimensions()); + assert(n == ifm_tensor->num_dimensions()); size_t k = _ctx.at(indices_index).shape().rank(); - assert(k == indices_alloc->num_dimensions()); + assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - if (n != ifm_alloc->info()->num_dimensions()) + if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction const auto ifm = _ctx.at(ifm_index); - ifm_alloc->info()->set_tensor_shape( + ifm_tensor->info()->set_tensor_shape( acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); } - if (k != indices_alloc->info()->num_dimensions()) + if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction const auto indices = _ctx.at(indices_index); - indices_alloc->info()->set_tensor_shape( + indices_tensor->info()->set_tensor_shape( acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); } - fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would // use arm_compute::TensorInfo::offset_element_in_bytes() @@ -621,20 +574,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto gamma_alloc = _tensor_builder->at(gamma_index).get(); - auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto gamma_tensor = _tensor_builder->at(gamma_index).get(); + auto beta_tensor = _tensor_builder->at(beta_index).get(); auto epsilon = node.param().epsilon; auto activation = node.param().activation; auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), - beta_alloc->handle(), epsilon); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), + beta_tensor->handle(), epsilon); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::L2Normalization &node) @@ -656,15 +609,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
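// What the CROSS_MAP normalization configured above reduces to: the layer
// computes roughly out = in / (bias + scale * sum(in^2))^beta over its window,
// so with bias = 0, beta = 0.5 and a window spanning the whole channel axis it
// becomes plain L2 normalization, out[c] = in[c] / sqrt(sum_k in[k]^2).
// Reference-only sketch (standalone code, not the ACL kernel):
#include <cmath>
#include <vector>

std::vector<float> l2_normalize(const std::vector<float> &channels)
{
  float sum_sq = 0.0f;
  for (float v : channels)
    sum_sq += v * v; // the "reduction" mentioned in the comments above
  const float inv_norm = 1.0f / std::sqrt(sum_sq); // beta = 0.5 -> 1/sqrt, bias = 0
  std::vector<float> out;
  out.reserve(channels.size());
  for (float v : channels)
    out.push_back(v * inv_norm);
  return out;
}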
- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -686,19 +639,20 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) @@ -712,15 +666,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -733,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NELogicalAnd>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -751,12 +705,12 @@ void KernelGenerator::visit(const 
ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEBitwiseNot>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -769,13 +723,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NELogicalOr>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -787,8 +741,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; @@ -798,7 +752,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) // instead of 'INF', and then the result of this op will be errors due to the 'NaN'. auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -807,159 +761,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) void KernelGenerator::visit(const ir::operation::LSTM &node) { - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
- const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool 
has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - const auto activation = node.param().activation; - const auto cell_clip = cell_threshold; - const auto projection_clip = projection_threshold; - assert(cell_clip >= 0.f && projection_clip >= 0.f); - - auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); - auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); - auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); - auto output_alloc = _tensor_builder->at(output_index).get(); - - auto input_alloc = _tensor_builder->at(input_index).get(); - - auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - _tensor_builder->at(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - _tensor_builder->at(recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); - auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); - auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); - auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); - auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); - - auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); - - auto fn = std::make_unique<::arm_compute::NELSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{}; - if (has_cifg_param) - { - auto input_to_input_weights_alloc = - _tensor_builder->at(input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (has_peephole_param) - { - auto cell_to_forget_weights_alloc = - _tensor_builder->at(cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - _tensor_builder->at(cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (has_projection_param) - { - auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional - auto projection_bias_handle = has_projection_bias - ? _tensor_builder->at(projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - fn->configure( - input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), - cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), - output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), - lstm_params, act_info, cell_clip, projection_clip); - - auto acl_fn = asAclFunction(std::move(fn)); - - _return_fn = std::move(acl_fn); + _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); } void KernelGenerator::visit(const ir::operation::Mul &node) @@ -970,18 +773,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>(); // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Neg &node) @@ -989,12 +792,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto 
ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NENegLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1030,12 +833,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) { size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1094,8 +897,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -1108,7 +911,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1119,7 +922,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1127,7 +930,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) { auto l = std::make_unique<::arm_compute::NECopy>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1143,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto alpha_tensor = _tensor_builder->at(alpha_index).get(); std::unique_ptr<::arm_compute::IFunction> fn; - auto l = std::make_unique<::arm_compute::NEPReLU>(); + auto l = std::make_unique<::arm_compute::NEPReluLayer>(); - l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); @@ -1166,14 +969,14 @@ void KernelGenerator::visit(const 
ir::operation::Reduce &node) const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_alloc->layout(); + const auto backend_layout = input_tensor->layout(); const auto reduce_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); const auto reduce_type = node.param().reduce_type; @@ -1182,11 +985,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) std::unique_ptr<::arm_compute::IFunction> fn; if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) { - // NOTE NEReduceMean has a bug that does not support NHWC layout - // NEReduceMean intermediate tensors are always NCHW layout - auto l = std::make_unique<::arm_compute::NEReduceMeanEx>(); + auto l = std::make_unique<::arm_compute::NEReduceMean>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -1194,7 +995,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) { auto l = std::make_unique<::arm_compute::NEReduceSum>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -1202,7 +1003,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) { auto l = std::make_unique<::arm_compute::NEReduceOperation>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(), + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(), acl_common::convertReduceType(reduce_type)); fn = std::move(l); @@ -1218,15 +1019,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1238,15 +1039,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); 
const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1258,15 +1059,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1278,13 +1079,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
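// Why the NOTE above matters: flattening the same logical element from NHWC
// and NCHW storage gives different linear offsets once rank >= 4, so a plain
// buffer-level reshape would silently permute data if the two sides disagreed
// on layout. Minimal standalone illustration (the shape and index are an
// assumed example, not taken from the patch):
#include <array>
#include <cstdio>

int main()
{
  const int N = 1, H = 2, W = 2, C = 3;
  (void)N;
  const std::array<int, 4> idx = {0, 1, 0, 2}; // logical (n, h, w, c)

  const int nhwc_offset = ((idx[0] * H + idx[1]) * W + idx[2]) * C + idx[3];
  const int nchw_offset = ((idx[0] * C + idx[3]) * H + idx[1]) * W + idx[2];

  // Prints 8 and 10: the same logical element sits at different flat offsets,
  // which is exactly the case the assert above rules out for rank >= 4.
  std::printf("NHWC offset: %d, NCHW offset: %d\n", nhwc_offset, nchw_offset);
  return 0;
}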
const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); UNUSED_RELEASE(frontend_layout); @@ -1292,7 +1093,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) auto fn = std::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1305,12 +1106,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NEScale>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); @@ -1334,25 +1135,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); - auto weights_alloc = _tensor_builder->at(weights_index).get(); - auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); - auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + auto weights_tensor = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = std::make_unique<::arm_compute::NECopy>(); - copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); _return_fn = asAclFunction(std::move(copy_layer)); - auto fn = std::make_unique<::arm_compute::NERNNLayerEx>( + auto fn = std::make_unique<::arm_compute::NERNNLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), - bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), - act_info); + fn->configure(input_tensor->handle(), weights_tensor->handle(), + recurrent_weights_tensor->handle(), bias_tensor->handle(), + hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -1361,12 +1162,12 @@ void KernelGenerator::visit(const 
ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NERsqrtLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -1383,10 +1184,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); _return_fn = std::move(acl_fn); } @@ -1396,15 +1197,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1417,13 +1218,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = input_tensor->layout(); + + // Disable applied dim_correction + const size_t input_rank = _ctx.at(input_index).shape().rank(); + if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + const auto input = _ctx.at(input_index); + input_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); + } auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + fn->configure(input_tensor->handle(), output_tensor->handle(), beta); auto acl_fn = asAclFunction(std::move(fn)); @@ -1438,20 +1251,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto 
paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); - auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + auto paddings_tensor = _tensor_builder->at(paddings_index).get(); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); - // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is - // not 0. - auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>(); + auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), - ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), + ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1465,12 +1276,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>(); + auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); auto acl_fn = asAclFunction(std::move(fn)); @@ -1489,13 +1300,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - std::vector output_allocs; + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + std::vector output_tensors; for (const auto &ofm_ind : output_indexes) - output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); auto axis = node.param().axis; if (axis < 0) axis += ifm_rank; @@ -1503,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto fn = std::make_unique<::arm_compute::NESplit>(); - fn->configure(ifm_alloc->handle(), output_allocs, axis); + fn->configure(ifm_tensor->handle(), output_tensors, axis); _return_fn = asAclFunction(std::move(fn)); } @@ -1513,15 +1324,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; auto fn = 
std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1534,13 +1345,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1555,17 +1366,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Slice &node) @@ -1575,10 +1386,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -1628,7 +1439,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto fn = std::make_unique<::arm_compute::NESlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); auto acl_fn = asAclFunction(std::move(fn)); @@ -1643,10 +1454,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto 
strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -1715,7 +1526,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<::arm_compute::NEStridedSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); auto acl_fn = asAclFunction(std::move(fn)); @@ -1749,16 +1560,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - invalid_horizontal, invalid_vertical); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), + tconv_info, invalid_horizontal, invalid_vertical); auto acl_fn = asAclFunction(std::move(fn)); @@ -1771,10 +1582,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; const auto &perm{node.param().perm}; - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - const auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + const auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); const auto rank = _ctx.at(ifm_idx).shape().rank(); std::vector pv(perm.cbegin(), perm.cend()); @@ -1783,11 +1594,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) std::unique_ptr<::arm_compute::IFunction> fn; - if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2) + if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2) { auto l = std::make_unique<::arm_compute::NETranspose>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1795,7 +1606,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); fn = 
std::move(l); } @@ -1834,13 +1645,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) { size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_alloc = _tensor_builder->at(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); - assert(output_rank == output_alloc->num_dimensions()); - if (output_rank != output_alloc->info()->num_dimensions()) + const auto &output_tensor = _tensor_builder->at(output_index); + orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); + assert(output_rank == output_tensor->num_dimensions()); + if (output_rank != output_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1858,17 +1669,17 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Div &node) @@ -1879,16 +1690,16 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Exp &node) @@ -1896,12 +1707,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn 
= std::make_unique<::arm_compute::NEExpLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1913,12 +1724,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1933,13 +1744,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), (arm_compute::ComparisonOperation)comparison_type); auto acl_fn = asAclFunction(std::move(fn)); @@ -1953,13 +1764,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseMin>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1972,13 +1783,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseMax>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); diff --git 
a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc index 71e3136..deb27f0 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.cc +++ b/runtime/onert/backend/cpu/ConstantInitializer.cc @@ -15,6 +15,7 @@ */ #include "ConstantInitializer.h" +#include "Tensor.h" namespace onert { @@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands, // DO NOTHING } +void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) +{ + registerExternalInitializer(index, obj); +} + +void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) +{ + // For only CONSTANTS + // TODO Add to check if tensor has been allocated + if (!obj.isConstant()) + return; + + _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) { + auto data = model_obj.shareData(); + assert(data && data->base()); + ExternalTensor &tensor = dynamic_cast(itensor); + tensor.setData(data); + }; +} + void ConstantInitializer::visit(const ir::operation::Conv2D &node) { const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); const auto &kernel_obj = _operands.at(kernel_index); - registerCopyInitializer(kernel_index, kernel_obj); + registerExternalInitializer(kernel_index, kernel_obj); const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) { const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); const auto &kernel_obj = _operands.at(kernel_index); - registerCopyInitializer(kernel_index, kernel_obj); + registerExternalInitializer(kernel_index, kernel_obj); const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } void ConstantInitializer::visit(const ir::operation::FullyConnected &node) { const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); const auto &weight_obj = _operands.at(weight_index); - registerCopyInitializer(weight_index, weight_obj); + registerExternalInitializer(weight_index, weight_obj); const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); if (!bias_index.undefined()) { const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } } diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h index bd06c64..de03a69 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.h +++ b/runtime/onert/backend/cpu/ConstantInitializer.h @@ -36,6 +36,15 @@ public: const std::shared_ptr &tensor_builder); public: + void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; + + // TODO: For now the only cpu backend supports constant tensor to use data from external + // If the other backend supports (to do this, + // ExternalTensor should be abstract such as IExternal, maybe), + // this can be an interface of IConstantInitializer + void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); + 
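// Sketch of the zero-copy idea behind registerExternalInitializer above
// (illustrative names only -- SharedBuffer and ZeroCopyTensor are not the
// onert classes): instead of copying constant weights element by element into
// a newly allocated backend tensor, the tensor just keeps a shared reference
// to the data already owned by the loaded model.
#include <cstdint>
#include <memory>
#include <vector>

struct SharedBuffer // stands in for the model-owned constant data
{
  std::vector<std::uint8_t> bytes;
  const std::uint8_t *base() const { return bytes.data(); }
};

class ZeroCopyTensor // stands in for the cpu backend's ExternalTensor
{
public:
  void setData(std::shared_ptr<const SharedBuffer> data) { _data = std::move(data); }
  const std::uint8_t *buffer() const { return _data ? _data->base() : nullptr; }

private:
  std::shared_ptr<const SharedBuffer> _data; // keeps the data alive; no memcpy
};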
+public: void visit(const ir::operation::Conv2D &) override; void visit(const ir::operation::DepthwiseConv2D &) override; void visit(const ir::operation::FullyConnected &) override; diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 72f9606..2766aa2 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -60,6 +60,7 @@ #include "ops/SoftMaxLayer.h" #include "ops/StridedSliceLayer.h" #include "ops/SpaceToBatchNDLayer.h" +#include "ops/SpaceToDepthLayer.h" #include "ops/SplitLayer.h" #include "ops/SubLayer.h" #include "ops/TanhLayer.h" @@ -70,11 +71,13 @@ #include "ops/ZerosLikeLayer.h" #include "ops/SquaredDiffLayer.h" #include "ops/LogicalOrLayer.h" +#include "ops/L2NormLayer.h" #include "ops/MatrixBandPartLayer.h" #include "ops/BatchMatMulLayer.h" #include "ops/BroadcastToLayer.h" #include "ops/FusedBatchNormLayer.h" #include "ops/LogSoftMaxLayer.h" +#include "ops/QuantizeLayer.h" #include #include @@ -184,10 +187,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); - auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); - auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); + auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); const auto stride = node.param().stride; const auto activation = node.param().activation; @@ -196,9 +199,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic()) { - fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left, + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left, param_padding.param.right, param_padding.param.top, param_padding.param.bottom, - stride.horizontal, stride.vertical, activation, ofm_alloc); + stride.horizontal, stride.vertical, activation, ofm_tensor); _return_fn = std::move(fn); return; @@ -213,9 +216,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto padding = ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right, - padding.top, padding.bottom, stride.horizontal, stride.vertical, activation, - ofm_alloc); + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, + padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, + activation, ofm_tensor); _return_fn = std::move(fn); } @@ -241,16 +244,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); - auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); - auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); + auto ofm_tensor = 
_tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); + auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top, + fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, multiplier, activation, - ofm_alloc); + ofm_tensor); _return_fn = std::move(fn); } @@ -270,13 +273,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, - stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); + fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, + stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -295,13 +298,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, - stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); + fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, + stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -313,7 +316,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) const auto rank = _ctx.at(ofm_index).shape().rank(); const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) @@ -321,7 +324,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto fn = std::make_unique(); - fn->configure(input_tensors, axis, output_alloc); + fn->configure(input_tensors, axis, output_tensor); _return_fn = std::move(fn); } @@ -332,13 +335,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node) const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto value_alloc = _tensor_builder->portableAt(value_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = 
_tensor_builder->portableAt(input_index).get(); + auto value_tensor = _tensor_builder->portableAt(value_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, value_alloc, output_alloc); + fn->configure(input_tensor, value_tensor, output_tensor); _return_fn = std::move(fn); } @@ -353,15 +356,15 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto weight_alloc = _tensor_builder->portableAt(weight_index).get(); - auto bias_alloc = + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto weight_tensor = _tensor_builder->portableAt(weight_index).get(); + auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_builder->portableAt(bias_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc); + fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor); _return_fn = std::move(fn); } @@ -371,21 +374,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); // optional 2nd input - IPortableTensor *shape_alloc = nullptr; + IPortableTensor *shape_tensor = nullptr; if (node.getInputs().size() == 2) { const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; - shape_alloc = _tensor_builder->portableAt(shape_index).get(); + shape_tensor = _tensor_builder->portableAt(shape_index).get(); } auto fn = std::make_unique(); - fn->configure(input_alloc, shape_alloc, output_alloc); + fn->configure(input_tensor, shape_tensor, output_tensor); _return_fn = std::move(fn); } @@ -394,13 +397,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); // Squeeze can share same kernel with reshape auto fn = std::make_unique(); - fn->configure(input_alloc, nullptr, output_alloc); + fn->configure(input_tensor, nullptr, output_tensor); _return_fn = std::move(fn); } @@ -412,12 +415,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, beta, output_alloc); + fn->configure(input_tensor, 
beta, output_tensor); _return_fn = std::move(fn); } @@ -430,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -447,15 +450,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto comparison_type = node.param().comparison_type; auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor); _return_fn = std::move(fn); } @@ -466,11 +469,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); UNUSED_RELEASE(backend_layout); // NOTE The frontend layout and backend layout must be the same for this operation. @@ -481,8 +484,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
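The layout restriction noted in the comment above is easiest to see with a concrete axis mapping. The helper below is purely illustrative (it is not part of this patch; ops::getAxis is the conversion actually used elsewhere in this file) and only spells out why a Gather expressed on NHWC axes cannot simply be re-indexed for an NCHW backend.

    // Illustrative NHWC -> NCHW axis mapping for a rank-4 tensor:
    //   N(0) -> 0, H(1) -> 2, W(2) -> 3, C(3) -> 1
    int nhwc_to_nchw_axis(int axis)
    {
      static const int map[4] = {0, 2, 3, 1};
      return map[axis];
    }
    // Gather with output rank 4, indices rank 2 and axis == 2 spans the NHWC axes W and C.
    // In NCHW those map to 3 and 1, which are not adjacent, so the operation cannot be
    // handled by converting the axis alone; frontend and backend layouts must match.
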
- assert(backend_layout == input_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == input_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); const auto &input_shape = _ctx.at(input_index).shape(); UNUSED_RELEASE(input_shape); assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout); @@ -492,7 +495,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) auto fn = std::make_unique(); - fn->configure(input_alloc, indices_alloc, output_alloc, axis_value); + fn->configure(input_tensor, indices_tensor, output_tensor, axis_value); _return_fn = std::move(fn); } @@ -506,13 +509,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -526,13 +529,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -547,18 +550,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); - auto depth_alloc = _tensor_builder->portableAt(depth_index).get(); - auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get(); - auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); + auto depth_tensor = _tensor_builder->portableAt(depth_index).get(); + auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get(); + auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get(); - assert(indices_alloc->data_type() == OperandType::INT32); - assert(axis <= static_cast(indices_alloc->num_dimensions())); + assert(indices_tensor->data_type() == OperandType::INT32); + assert(axis <= static_cast(indices_tensor->num_dimensions())); auto fn = std::make_unique(); - fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis); + fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis); _return_fn = std::move(fn); } @@ -572,13 +575,13 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = 
node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -587,16 +590,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); - std::vector input_allocs; + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) - input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); const auto equation = node.param().equation; auto fn = std::make_unique(); - fn->configure(input_allocs, equation, output_alloc); + fn->configure(input_tensors, equation, output_tensor); _return_fn = std::move(fn); } @@ -605,14 +608,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node) { auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector &types, - std::vector> &allocs) { + std::vector> &tensors) { for (auto &idx : opSeq) { const auto &operand = _ctx.at(idx); // TODO make sure using `_current_op_seq_layout` is correct for custom operations types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()}); - auto in_alloc = _tensor_builder->portableAt(idx); - allocs.emplace_back(in_alloc); + auto in_tensor = _tensor_builder->portableAt(idx); + tensors.emplace_back(in_tensor); } }; @@ -634,12 +637,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -650,13 +653,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, axis_alloc, output_alloc); + fn->configure(input_tensor, axis_tensor, output_tensor); _return_fn = std::move(fn); } @@ -666,12 +669,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto 
output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -681,12 +684,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -700,7 +703,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) assert(-rank <= axis && axis < rank); - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) @@ -708,7 +711,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) auto fn = std::make_unique(); - fn->configure(input_tensors, axis, output_alloc); + fn->configure(input_tensors, axis, output_tensor); _return_fn = std::move(fn); } @@ -722,7 +725,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) assert(rank == 0 || (-rank <= axis && axis < rank)); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); std::vector output_tensors; for (auto &output_idx : node.getOutputs()) @@ -732,7 +735,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) uint32_t axis_resolved = (axis < 0 ? axis + rank : axis); - fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors); + fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors); _return_fn = std::move(fn); } @@ -751,8 +754,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node) auto fn = std::make_unique(); - fn->configure(input, output, pad_base, pad_rank); + bool isPadV2 = node.getInputs().size() == 3 ? 
true : false; + const void *value = nullptr; + if (isPadV2) + { + const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)}; + value = reinterpret_cast(_ctx.at(value_index).data()->base()); + } + + fn->configure(input, output, pad_base, pad_rank, value); _return_fn = std::move(fn); } @@ -762,13 +773,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -779,13 +790,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -795,12 +806,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -810,12 +821,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc, node.param().perm); + fn->configure(input_tensor, output_tensor, node.param().perm); _return_fn = std::move(fn); } @@ -827,15 +838,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; const auto keep_dims = node.param().keep_dims; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axes_alloc 
= _tensor_builder->portableAt(axes_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axes_tensor = _tensor_builder->portableAt(axes_index).get(); if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) { auto fn = std::make_unique(); - fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims); + fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims); _return_fn = std::move(fn); } @@ -844,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) auto fn = std::make_unique(); const auto reduce_type = convertReduceType(node.param().reduce_type); - fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims); + fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims); _return_fn = std::move(fn); } @@ -855,12 +866,12 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -872,14 +883,14 @@ void KernelGenerator::visit(const ir::operation::Select &node) const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto condition_alloc = _tensor_builder->portableAt(condition_index).get(); - auto true_alloc = _tensor_builder->portableAt(true_index).get(); - auto false_alloc = _tensor_builder->portableAt(false_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto condition_tensor = _tensor_builder->portableAt(condition_index).get(); + auto true_tensor = _tensor_builder->portableAt(true_index).get(); + auto false_tensor = _tensor_builder->portableAt(false_index).get(); auto fn = std::make_unique(); - fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc); + fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor); _return_fn = std::move(fn); } @@ -891,14 +902,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto begins_alloc = _tensor_builder->portableAt(begins_index).get(); - auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto begins_tensor = _tensor_builder->portableAt(begins_index).get(); + auto sizes_tensor = _tensor_builder->portableAt(sizes_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc); + fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor); 
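Every visit in this kernel generator follows the same shape, shown condensed below. SomeOp and ops::SomeLayer are placeholders, not classes introduced by this patch: tensors are looked up as IPortableTensor handles, a kernel object is created and configured, and the finished function is handed back through _return_fn.

    void KernelGenerator::visit(const ir::operation::SomeOp &node)
    {
      const auto out_index{node.getOutputs().at(0)};
      const auto in_index{node.getInputs().at(0)};

      // Look up IPortableTensor handles from the tensor builder
      auto out_tensor = _tensor_builder->portableAt(out_index).get();
      auto in_tensor = _tensor_builder->portableAt(in_index).get();

      // Create the kernel, bind tensors and parameters, and hand it to the executor
      auto fn = std::make_unique<ops::SomeLayer>();
      fn->configure(in_tensor, out_tensor);
      _return_fn = std::move(fn);
    }
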
_return_fn = std::move(fn); } @@ -911,11 +922,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto starts_alloc = _tensor_builder->portableAt(starts_index).get(); - auto ends_alloc = _tensor_builder->portableAt(ends_index).get(); - auto strides_alloc = _tensor_builder->portableAt(strides_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto starts_tensor = _tensor_builder->portableAt(starts_index).get(); + auto ends_tensor = _tensor_builder->portableAt(ends_index).get(); + auto strides_tensor = _tensor_builder->portableAt(strides_index).get(); auto begin_mask = node.param().begin_mask; auto end_mask = node.param().end_mask; @@ -923,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique(); - fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask, + fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask, end_mask, shrink_axis_mask); _return_fn = std::move(fn); @@ -957,12 +968,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -972,12 +983,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -987,12 +998,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1002,12 +1013,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = 
_tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1017,12 +1028,12 @@ void KernelGenerator::visit(const ir::operation::Shape &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1033,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, axis_alloc, output_alloc); + fn->configure(input_tensor, axis_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1049,12 +1060,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1066,12 +1077,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true); + fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); _return_fn = std::move(fn); } @@ -1082,13 +1093,13 @@ void KernelGenerator::visit(const ir::operation::Pow &node) const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto 
lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc); + fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor); _return_fn = std::move(fn); } @@ -1098,12 +1109,12 @@ void KernelGenerator::visit(const ir::operation::Log &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1113,12 +1124,12 @@ void KernelGenerator::visit(const ir::operation::Round &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1128,12 +1139,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1144,28 +1155,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto lhs_index{node.getInputs().at(0)}; const auto rhs_index{node.getInputs().at(1)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } -void KernelGenerator::visit(const ir::operation::ZerosLike &node) +void KernelGenerator::visit(const ir::operation::L2Normalization &node) { const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; + const auto input_index{node.getInputs().at(0)}; auto output_alloc = _tensor_builder->portableAt(output_index).get(); auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto fn = std::make_unique(); + auto fn = std::make_unique(); fn->configure(input_alloc, output_alloc); + + 
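As a reference for the L2Normalization kernel wired up above: the operation computes y_i = x_i / sqrt(sum_j x_j^2) along the innermost axis. The standalone sketch below is independent of the actual ops::L2NormLayer implementation and only illustrates the math.

    #include <cmath>

    void l2_normalize(const float *in, float *out, int outer_size, int depth)
    {
      for (int o = 0; o < outer_size; ++o)
      {
        float sum_sq = 0.0f;
        for (int d = 0; d < depth; ++d)
          sum_sq += in[o * depth + d] * in[o * depth + d];
        const float inv_norm = 1.0f / std::sqrt(sum_sq); // epsilon handling omitted for brevity
        for (int d = 0; d < depth; ++d)
          out[o * depth + d] = in[o * depth + d] * inv_norm;
      }
    }
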
_return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::ZerosLike &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1176,14 +1202,14 @@ void KernelGenerator::visit(const ir::operation::Range &node) const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto start_alloc = _tensor_builder->portableAt(start_index).get(); - auto limit_alloc = _tensor_builder->portableAt(limit_index).get(); - auto delta_alloc = _tensor_builder->portableAt(delta_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto start_tensor = _tensor_builder->portableAt(start_index).get(); + auto limit_tensor = _tensor_builder->portableAt(limit_index).get(); + auto delta_tensor = _tensor_builder->portableAt(delta_index).get(); auto fn = std::make_unique(); - fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc); + fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1193,13 +1219,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1209,13 +1235,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node) const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, multiples_alloc, output_alloc); + fn->configure(input_tensor, multiples_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1226,14 +1252,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto 
input_alloc = _tensor_builder->portableAt(input_index).get(); - auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get(); - auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get(); + auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc); + fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1243,16 +1269,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); const auto adj_x = node.param().adj_x; const auto adj_y = node.param().adj_y; auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc); + fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor); _return_fn = std::move(fn); } @@ -1262,13 +1288,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto shape_alloc = _tensor_builder->portableAt(shape_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto shape_tensor = _tensor_builder->portableAt(shape_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, shape_alloc, output_alloc); + fn->configure(input_tensor, shape_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1277,10 +1303,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); - std::vector input_allocs; + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) - input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); const auto epsilon = node.param().epsilon; const auto is_training = node.param().is_training; @@ -1288,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) auto fn = std::make_unique(); - fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc); + fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor); _return_fn = std::move(fn); } @@ -1301,12 +1327,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) const auto beta = 
node.param().beta; const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, beta, axis, output_alloc); + fn->configure(input_tensor, beta, axis, output_tensor); _return_fn = std::move(fn); } @@ -1318,14 +1344,45 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get(); - auto padding_alloc = _tensor_builder->portableAt(padding_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get(); + auto padding_tensor = _tensor_builder->portableAt(padding_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc); + fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::Quantize &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + auto block_size = node.param().block_size; + + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, block_size, output_tensor); _return_fn = std::move(fn); } diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h index d6f4c28..f564bf8 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.h +++ b/runtime/onert/backend/cpu/KernelGenerator.h @@ -94,6 +94,7 @@ public: void visit(const ir::operation::SquaredDifference &) override; void visit(const ir::operation::Tile &) override; void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::L2Normalization &) override; void visit(const ir::operation::Range &) override; void visit(const ir::operation::MatrixBandPart &) override; void visit(const ir::operation::BatchMatMul &) override; @@ -101,6 +102,8 @@ public: void visit(const ir::operation::FusedBatchNorm &) override; void visit(const ir::operation::LogSoftmax &) override; void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::Quantize &) override; + void visit(const ir::operation::SpaceToDepth 
&) override; private: const ir::Operands &_ctx; diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc new file mode 100644 index 0000000..8723072 --- /dev/null +++ b/runtime/onert/backend/cpu/StaticTensorManager.cc @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "StaticTensorManager.h" +#include "Tensor.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +StaticTensorManager::StaticTensorManager(const std::shared_ptr ®) + : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg} +{ + // DO NOTHING +} + +void StaticTensorManager::allocateNonconsts(void) +{ + _nonconst_mgr->allocate(); + + for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; + if (!_as_constants[ind] && !tensor->is_dynamic()) + { + auto *buffer = _nonconst_mgr->getBuffer(ind); + tensor->setBuffer(buffer); + + VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() + << "): " << static_cast(buffer) << std::endl; + } + } +} + +void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } + +void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, ir::Layout backend_layout, + bool as_const) +{ + assert(!_tensors->getITensor(ind)); + if (as_const) + { + auto tensor = std::make_shared(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, tensor); + } + else + { + auto tensor = std::make_shared(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, tensor); + } + _as_constants[ind] = as_const; +} + +void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) +{ + assert(_tensors->getITensor(ind)); + + // This method is called only when a tensor has proper shape + assert(!_tensors->getITensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->claimPlan(ind, size); +} + +void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) +{ + assert(_tensors->getITensor(ind)); + + // This method is called only when a tensor has proper shape + assert(!_tensors->getITensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->releasePlan(ind); +} + +void StaticTensorManager::iterate(const std::function &fn) +{ + for (const auto &it : _tensors->native_tensors()) + fn(it.first); +} + +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h new file mode 100644 index 0000000..66243a5 --- /dev/null +++ b/runtime/onert/backend/cpu/StaticTensorManager.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
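The expected call sequence for the manager introduced here, as a rough usage sketch; reg, ind, info and size_in_bytes are placeholders, and the real calls are made by TensorBuilder further down in this patch. The key point is that only non-constant static tensors participate in memory planning, while constants alias external (model) data.

    StaticTensorManager mgr{reg};                                      // reg: shared cpu_common::TensorRegistry
    mgr.buildTensor(ind, info, ir::Layout::NHWC, /*as_const=*/false);  // constants become ExternalTensor instead
    mgr.claimPlan(ind, size_in_bytes);   // planned only for non-constant, non-dynamic tensors
    mgr.allocateNonconsts();             // constants need no backend buffer, so nothing to allocate for them
    // ... run ...
    mgr.releasePlan(ind);
    mgr.deallocateNonconsts();
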
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ +#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ + +#include "backend/IStaticTensorManager.h" +#include "backend/cpu_common/MemoryManager.h" +#include "backend/cpu_common/TensorRegistry.h" +#include "backend/ITensorManager.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandInfo.h" + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +class StaticTensorManager : public backend::IStaticTensorManager +{ +public: + StaticTensorManager(const std::shared_ptr ®); + virtual ~StaticTensorManager() = default; + + void allocateNonconsts(void); + void deallocateNonconsts(void); + + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, + ir::Layout backend_layout, bool as_const); + + void claimPlan(const ir::OperandIndex &ind, uint32_t size); + void releasePlan(const ir::OperandIndex &ind); + + void iterate(const std::function &fn); + +private: + std::unique_ptr _nonconst_mgr; + const std::shared_ptr _tensors; + ir::OperandIndexMap _as_constants; +}; + +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h index 4dd251b..da16d05 100644 --- a/runtime/onert/backend/cpu/Tensor.h +++ b/runtime/onert/backend/cpu/Tensor.h @@ -29,8 +29,14 @@ namespace cpu using Tensor = cpu_common::Tensor; -// Tensor which has data from external. To support this, assume below things -// no padding, always NHWC layout, constant tensor and not dynamic +/** + * @brief Class that uses data from external memory that is not managed by a backend + * instead of allocating and copying the data. ExternalTensor's data pointer points to + * an address of memory such as where memory is already allocated, or mmapped area. + * This is meaning that ExternalTensor can take all of types' ir::Data. + * To support this, assume below things no padding, always NHWC layout, + * constant tensor and not dynamic. 
+ */ class ExternalTensor : public Tensor { public: @@ -45,6 +51,11 @@ public: } public: + /** + * @brief set Data to be shared from external so that this ExternalTensor will not be + * allocated on CPU backend + * @param[in] data data of Operand to be set + */ void setData(const std::shared_ptr data) { assert(data != nullptr); diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc index 886e8d8..7eb3ce8 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.cc +++ b/runtime/onert/backend/cpu/TensorBuilder.cc @@ -29,7 +29,7 @@ namespace cpu TensorBuilder::TensorBuilder() : _tensor_reg{new cpu_common::TensorRegistry()}, - _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)}, + _static_tensor_mgr{new StaticTensorManager(_tensor_reg)}, _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)} { /* empty */ @@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const return _tensor_info_map.find(ind) != _tensor_info_map.end(); } -void TensorBuilder::prepare(void) -{ - _static_tensor_mgr->allocateConsts(); - _static_tensor_mgr->allocateNonconsts(); -} +void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } void TensorBuilder::allocate() { @@ -99,17 +95,17 @@ std::shared_ptr TensorBuilder::portableAt(const ir::OperandInde return _tensor_reg->getPortableTensor(ind); } -bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr &tensor) +bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind, + const std::shared_ptr &tensor) { - return _tensor_reg->setExternalTensor(ind, tensor); + return _tensor_reg->setMigrantTensor(ind, tensor); } void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); } -std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) +std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) { - return _tensor_reg->getManagedTensor(ind); + return _tensor_reg->getNativeTensor(ind); } std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h index ba25451..12ca28c 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.h +++ b/runtime/onert/backend/cpu/TensorBuilder.h @@ -18,13 +18,14 @@ #define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__ #include -#include #include -#include #include #include +#include "StaticTensorManager.h" +#include "Tensor.h" + #include namespace onert @@ -80,16 +81,16 @@ public: * If not, program will crash with assert or exception. 
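The simplified prepare() above no longer calls allocateConsts() because constant operands are now wrapped in ExternalTensor, whose buffer points directly at the operand's ir::Data (for example an mmapped region of the model file), so nothing is copied into backend-managed memory. A condensed, hypothetical stand-in for that idea, assuming ir::Data exposes base() and size(); this is not the actual class:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>

// Illustrative only: share externally owned constant data instead of allocating.
class SharedConstData
{
public:
  void setData(const std::shared_ptr<onert::ir::Data> &data)
  {
    assert(data != nullptr);
    _data = data; // keep the Data alive; no memcpy into backend memory
  }
  const uint8_t *buffer() const { return _data ? _data->base() : nullptr; }
  size_t total_size() const { return _data ? _data->size() : 0; }

private:
  std::shared_ptr<onert::ir::Data> _data;
};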
* @return shared_ptr */ - std::shared_ptr at(const ir::OperandIndex &ind); + std::shared_ptr at(const ir::OperandIndex &ind); std::shared_ptr portableAt(const ir::OperandIndex &ind); - bool setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr &tensor) override; + bool setMigrantTensor(const ir::OperandIndex &ind, + const std::shared_ptr &tensor) override; std::shared_ptr tensorRegistry() override { return _tensor_reg; } private: const std::shared_ptr _tensor_reg; - std::unique_ptr _static_tensor_mgr; + std::unique_ptr _static_tensor_mgr; std::unique_ptr _dynamic_tensor_mgr; ir::OperandIndexMap _tensor_info_map; }; diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc index f557f3a..adf902a 100644 --- a/runtime/onert/backend/cpu/ops/CompareLayer.cc +++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc @@ -17,6 +17,7 @@ #include "OperationUtils.h" +#include #include using namespace nnfw::cker; namespace onert @@ -34,6 +35,14 @@ namespace using OpType = onert::ir::operation::Comparison::ComparisonType; using namespace onert::backend::cpu; +// Assumes these enum values to be in the order like this +static_assert(static_cast(OpType::Equal) == 0, "An OpType value has changed!"); +static_assert(static_cast(OpType::NotEqual) == 1, "An OpType value has changed!"); +static_assert(static_cast(OpType::Greater) == 2, "An OpType value has changed!"); +static_assert(static_cast(OpType::GreaterEqual) == 3, "An OpType value has changed!"); +static_assert(static_cast(OpType::Less) == 4, "An OpType value has changed!"); +static_assert(static_cast(OpType::LessEqual) == 5, "An OpType value has changed!"); + template void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, OpType op_type) @@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort ¶ms.input2_shift); params.is_broadcast = !HaveSameShapes(lhs, rhs); - if (params.is_broadcast) - { - switch (op_type) - { - case OpType::Equal: - Broadcast4DSlowEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - Broadcast4DSlowNotEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - Broadcast4DSlowGreaterWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - Broadcast4DSlowGreaterEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - Broadcast4DSlowLessWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - Broadcast4DSlowLessEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), 
- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - else // if (requires_broadcast == false) - { - switch (op_type) - { - case OpType::Equal: - EqualWithScaling(params, getExtendedTensorShape(lhs), - reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), - reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - NotEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - GreaterWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - GreaterEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - LessWithScaling(params, getExtendedTensorShape(lhs), - reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), - reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - LessEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - return; + using CompareFunction = + void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, + bool *output_data); + + static const CompareFunction broadcast_fns[] = { + Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling, + Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling, + Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling, + }; + static const CompareFunction non_broadcast_fns[] = { + EqualWithScaling, NotEqualWithScaling, GreaterWithScaling, + GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling, + }; + + static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), + "Sizes of broadcast_fns and non_broadcast_fns must match!"); + + auto index = static_cast(op_type); + if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) + throw std::runtime_error{"Invalid OpType for CompareLayer"}; + + CompareFunction fn = (params.is_broadcast ? 
broadcast_fns[index] : non_broadcast_fns[index]); + + fn(params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), + getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), + getExtendedTensorShape(output), reinterpret_cast(output->buffer())); } template @@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort { bool requires_broadcast = !HaveSameShapes(lhs, rhs); - if (requires_broadcast) - { - switch (op_type) - { - case OpType::Equal: - Broadcast4DSlowEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - Broadcast4DSlowNotEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - Broadcast4DSlowGreater( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - Broadcast4DSlowGreaterEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - Broadcast4DSlowLessEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - else // if (requires_broadcast == false) - { - switch (op_type) - { - case OpType::Equal: - EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - GreaterEqualNoScaling( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - 
getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - return; + using CompareFunction = + void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, bool *output_data); + + static const CompareFunction broadcast_fns[] = { + Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater, + Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual, + }; + static const CompareFunction non_broadcast_fns[] = { + EqualNoScaling, NotEqualNoScaling, GreaterNoScaling, + GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling, + }; + + static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), + "Sizes of broadcast_fns and non_broadcast_fns must match!"); + + auto index = static_cast(op_type); + if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) + throw std::runtime_error{"Invalid OpType for CompareLayer"}; + + CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]); + + fn(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), + getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), + getExtendedTensorShape(output), reinterpret_cast(output->buffer())); } + } // namespace CompareLayer::CompareLayer() diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index c00be64..ff22e32 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -18,6 +18,7 @@ #include "../Tensor.h" #include +#include namespace onert { @@ -112,15 +113,32 @@ void FullyConnectedLayer::fullyConnectedHybrid() getTensorShape(_bias), reinterpret_cast(_bias ? _bias->buffer() : nullptr), getTensorShape(_output), reinterpret_cast(_output->buffer()), temp_arena); -// TODO Enable calling decrease_ref -#if 0 +// TODO Remove this ifdef +#ifdef EXPERIMENTAL_RUY_FEATURE if (_cached_weights == nullptr || _is_weights_freed) return; + // '_cached_weights is not nullptr and _is_weights_freed is false' means + // this weight shape is satisfied with the ruy kernel's prepack cache's condition. + // After entering here, it will not enter again except below the case - input is zero-vector + + // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path) + // so that handle this case + const int input_size = getTensorShape(_input).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast(_input->buffer()), input_size)) + return; + + // This weight tensor could be other ops' const tensor. + // Therefore, below reference should be checked like following auto weight_tensor = dynamic_cast(_weights); if (weight_tensor) { auto tensor = const_cast(weight_tensor); + if (tensor->buffer() == nullptr) // ref is already 0? + { + _is_weights_freed = true; + return; + } tensor->decrease_ref(); if (tensor->buffer() == nullptr) // ref == 0? 
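Stepping back to the CompareLayer change earlier in this hunk: the two switch statements were replaced by arrays of function pointers indexed by the ComparisonType value, which is why the static_asserts pinning the enum values were added at the top of the file. A toy example of the same pattern, unrelated to the real cker kernels, showing the index check and table lookup:

#include <cstddef>
#include <stdexcept>

enum class CmpOp { Equal = 0, NotEqual = 1, Greater = 2 };

using CmpFn = bool (*)(int, int);
static bool opEqual(int a, int b) { return a == b; }
static bool opNotEqual(int a, int b) { return a != b; }
static bool opGreater(int a, int b) { return a > b; }

bool dispatch(CmpOp op, int a, int b)
{
  static const CmpFn fns[] = {opEqual, opNotEqual, opGreater}; // must match the enum order
  const auto index = static_cast<std::size_t>(op);
  if (index >= sizeof(fns) / sizeof(fns[0]))
    throw std::runtime_error{"Invalid CmpOp"};
  return fns[index](a, b);
}

The design trades the explicit per-case code of the switch for a single shared call site; the cost is that enum order and table order must stay in sync, which the static_asserts enforce at compile time.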
@@ -128,7 +146,7 @@ void FullyConnectedLayer::fullyConnectedHybrid() _is_weights_freed = true; } } -#endif // if 0 +#endif #endif } @@ -167,7 +185,17 @@ void FullyConnectedLayer::run() void FullyConnectedLayer::prepare() { + if (_bias && _bias->is_constant()) + { + const int bias_size = getTensorShape(_bias).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast(_bias->buffer()), bias_size)) + { + _bias = nullptr; + } + } + #ifdef USE_RUY_GEMV +#ifdef EXPERIMENTAL_RUY_FEATURE // TODO This is workaround // The only fc hybrid will use ruy kernel if (_input->data_type() != OperandType::FLOAT32 || @@ -199,6 +227,7 @@ void FullyConnectedLayer::prepare() } } #endif +#endif } } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h index dd5ef24..e405b24 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h @@ -72,6 +72,9 @@ private: #ifdef USE_RUY_GEMV uint8_t *_cached_weights = nullptr; // weights to be cached and a key +#ifdef EXPERIMENTAL_RUY_FEATURE + bool _is_weights_freed = false; // is weights freed? +#endif #endif }; diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc new file mode 100644 index 0000000..0d99b05 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "L2NormLayer.h" + +#include "OperationUtils.h" + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + assert(input != nullptr); + assert(output != nullptr); + + _input = input; + _output = output; +} + +void L2NormLayer::run() +{ + switch (_input->data_type()) + { + case OperandType::FLOAT32: + nnfw::cker::L2NormalizeFloat32( + getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer())); + break; + + case OperandType::QUANT_UINT8_ASYMM: + { + nnfw::cker::L2NormParams params; + assert(_input->data_offset() == 128); + params.input_zero_point = _input->data_offset(); + nnfw::cker::L2NormalizeQuant8( + params, getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer())); + } + break; + + default: + throw std::runtime_error{"L2Norm: Unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h new file mode 100644 index 0000000..63f2d11 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in riting, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class L2NormLayer : public ::onert::exec::IFunction +{ +public: + L2NormLayer() : _input(nullptr), _output(nullptr) + { + // Nothing + } + +public: + void configure(const IPortableTensor *_input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc index d71e325..06dde4f 100644 --- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc @@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8() // NYI } -void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis, - Tensor *output) +void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis, + IPortableTensor *output) { _input = input; _output = output; diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h index bc145ce..ba9deca 100644 --- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h @@ -40,13 +40,14 @@ public: void logsoftmaxQuant8(); - void configure(const Tensor *input, const float beta, const int axis, Tensor *output); + void configure(const IPortableTensor *input, const float beta, const int axis, + IPortableTensor *output); void run(); private: - const Tensor *_input; - Tensor *_output; + const IPortableTensor *_input; + IPortableTensor *_output; float _beta; int _axis; diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h index 8d29374..9838552 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.h +++ b/runtime/onert/backend/cpu/ops/OperationUtils.h @@ -52,6 +52,17 @@ union DataPtr { void *v; }; +union ConstDataPtr { + const uint8_t *u8; + const int8_t *i8; + const uint32_t *u32; + const int32_t *i32; + const bool *b; + const float *f; + const int64_t *i64; + const void *v; +}; + uint32_t getNumberOfDimensions(const IPortableTensor *tensor); uint32_t getNumberOfElements(const IPortableTensor *tensor); diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc index fcfcf7b..6a2bf9d 100644 --- a/runtime/onert/backend/cpu/ops/PadLayer.cc +++ b/runtime/onert/backend/cpu/ops/PadLayer.cc @@ -33,33 +33,40 @@ PadLayer::PadLayer() // DO NOTHING } -void PadLayer::padFloat32() +template void PadLayer::padImpl(const T *constant_value_data) { - nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), - 
reinterpret_cast(_input->buffer()), getTensorShape(_output), - reinterpret_cast(_output->buffer()), _constantValueData.f); + nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), + reinterpret_cast(_input->buffer()), getTensorShape(_output), + reinterpret_cast(_output->buffer()), constant_value_data); } -void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); } void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output, - const int32_t *padData, int32_t padRank, uint8_t *constantValueData) + const int32_t *padData, int32_t padRank, const void *constantValueData) { _input = input; _output = output; memcpy(_padData, padData, sizeof(_padData)); _padRank = padRank; - _constantValueData.u8 = constantValueData; + _constantValueData.v = constantValueData; } void PadLayer::run() { if (_input->data_type() == OperandType::FLOAT32) { - padFloat32(); + padImpl(_constantValueData.f); } else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { - padQuant8(); + if (_constantValueData.u8 == nullptr) + { + uint8_t pad_value = static_cast(_output->data_offset()); + padImpl(&pad_value); + } + else + { + padImpl(_constantValueData.u8); + } } else { diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h index 85bd2e6..efd73d5 100644 --- a/runtime/onert/backend/cpu/ops/PadLayer.h +++ b/runtime/onert/backend/cpu/ops/PadLayer.h @@ -39,12 +39,10 @@ public: PadLayer(); public: - void padFloat32(); - - void padQuant8(); + template void padImpl(const T *constant_value_data); void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData, - int32_t padRank, uint8_t *constantValueData = nullptr); + int32_t padRank, const void *constantValueData = nullptr); void run() override; @@ -54,7 +52,7 @@ private: int32_t _padData[8]; int32_t _padRank; - DataPtr _constantValueData; + ConstDataPtr _constantValueData; }; } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc new file mode 100644 index 0000000..45fc148 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
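In the quantized Pad path added above, when no explicit pad value is supplied the layer pads with the output tensor's zero point (data_offset), because in an asymmetric uint8 encoding the zero point is the representation of real 0.0. A compressed sketch of that selection, as a hypothetical helper mirroring the branch in PadLayer::run:

#include <cstdint>

// Pick the uint8 pad value: the user-supplied constant if present,
// otherwise the output zero point, i.e. the quantized encoding of 0.0.
uint8_t selectQuantPadValue(const uint8_t *constant_value_data, int32_t output_zero_point)
{
  if (constant_value_data != nullptr)
    return *constant_value_data;
  return static_cast<uint8_t>(output_zero_point);
}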
+ */ + +#include "QuantizeLayer.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +template void QuantizeLayer::affineQuantize() +{ + nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _output->data_scale(), _output->data_offset()); +} + +void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + _input = input; + _output = output; +} + +void QuantizeLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + affineQuantize(); + } + else + { + throw std::runtime_error{"Quantize: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h new file mode 100644 index 0000000..b4e7aca --- /dev/null +++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ + +#include +#include "OperationUtils.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class QuantizeLayer : public ::onert::exec::IFunction +{ +public: + QuantizeLayer(); + +public: + template void affineQuantize(); + + void configure(const IPortableTensor *input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc index a9106c1..449c073 100644 --- a/runtime/onert/backend/cpu/ops/SliceLayer.cc +++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc @@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b } } -void SliceLayer::sliceFloat32() +template void SliceLayer::sliceImpl() { const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize; @@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32() } nnfw::cker::Slice(op_params, getExtendedTensorShape(_input), - reinterpret_cast(_input->buffer()), - reinterpret_cast(_output->buffer())); -} - -void SliceLayer::sliceQuant8() -{ - // cker quant8 slice is not implemented yet - throw std::runtime_error{"NYI"}; + reinterpret_cast(_input->buffer()), + reinterpret_cast(_output->buffer())); } void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, @@ -97,11 +91,11 @@ void SliceLayer::run() { if (_input->data_type() == OperandType::FLOAT32) { - sliceFloat32(); + sliceImpl(); } else if 
(_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { - sliceQuant8(); + sliceImpl(); } else { diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h index 9945d7e..650e2c9 100644 --- a/runtime/onert/backend/cpu/ops/SliceLayer.h +++ b/runtime/onert/backend/cpu/ops/SliceLayer.h @@ -42,8 +42,7 @@ public: void run() override; private: - void sliceFloat32(); - void sliceQuant8(); + template void sliceImpl(); template void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin, diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc new file mode 100644 index 0000000..110b0bc --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "SpaceToDepthLayer.h" + +#include "OperationUtils.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr) +{ + // DO NOTHING +} + +template void SpaceToDepthLayer::spaceToDepth() +{ + + nnfw::cker::SpaceToDepthParams params; + params.block_size = _block_size; + + nnfw::cker::SpaceToDepth(params, getTensorShape(_input), + reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer())); +} + +void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size, + IPortableTensor *output) +{ + _input = input; + _block_size = block_size; + _output = output; +} + +void SpaceToDepthLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + spaceToDepth(); + } + else + { + throw std::runtime_error{"SpaceToDepth: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h new file mode 100644 index 0000000..c11ef2b --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in riting, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
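SpaceToDepth, as wired up above for FLOAT32, moves each block_size x block_size spatial block into the channel dimension, so an NHWC input of shape [N, H, W, C] produces [N, H/b, W/b, C*b*b]. A small helper showing only the expected output shape, illustrative and independent of the cker kernel:

#include <array>
#include <cassert>

// Output shape of SpaceToDepth for an NHWC input; H and W must be divisible by b.
std::array<int, 4> spaceToDepthShape(const std::array<int, 4> &nhwc, int b)
{
  assert(b > 0 && nhwc[1] % b == 0 && nhwc[2] % b == 0);
  return {nhwc[0], nhwc[1] / b, nhwc[2] / b, nhwc[3] * b * b};
}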
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class SpaceToDepthLayer : public ::onert::exec::IFunction +{ +public: + SpaceToDepthLayer(); + + void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output); + + void run() override; + +private: + template void spaceToDepth(); + + const IPortableTensor *_input; + int32_t _block_size; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_BATCH_ND_LAYER_H__ diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h index a49525b..b760cda 100644 --- a/runtime/onert/core/include/backend/ITensorBuilder.h +++ b/runtime/onert/core/include/backend/ITensorBuilder.h @@ -112,12 +112,12 @@ public: // methods for static tensor allocation virtual std::shared_ptr tensorAt(const ir::OperandIndex &ind) = 0; /** - * @brief Set the External Tensor object + * @brief Set the migrant tensor object * * @return true if succeeded * @return false if failed or unsupported */ - virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr &) + virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr &) { return false; } diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h index f5a95f4..8555131 100644 --- a/runtime/onert/core/include/backend/ITensorRegistry.h +++ b/runtime/onert/core/include/backend/ITensorRegistry.h @@ -35,17 +35,22 @@ struct ITensorRegistry virtual ~ITensorRegistry() = default; /** - * @brief Returns pointer of ITensor among managed and external tensors + * @brief Returns pointer of ITensor among native and migrant tensors + * + * Native Tensor is a tensor that is managed by this backend + * Migrant Tensor is a tensor that is imported from another backend + * * @note Return tensor cannot be used longer than dynamic tensor manager */ virtual std::shared_ptr getITensor(const ir::OperandIndex &) = 0; /** - * @brief Returns pointer of ITensor among managed tensors + * @brief Returns pointer of ITensor among native tensors * - * Unlike @c getITensor , this function only searches from managed tensors - * @note Return tensor cannot be used longer than dynamic tensor manager + * Unlike @c getITensor , this function only searches from native tensors + * + * @note Returned tensor cannot be used longer than dynamic tensor manager */ - virtual std::shared_ptr getManagedITensor(const ir::OperandIndex &) = 0; + virtual std::shared_ptr getNativeITensor(const ir::OperandIndex &) = 0; }; } // namespace backend @@ -73,68 +78,67 @@ public: std::shared_ptr getITensor(const ir::OperandIndex &ind) override { static_assert(std::is_base_of::value, "T_Tensor must derive from ITensor."); - auto external_tensor = _external.find(ind); - if (external_tensor != _external.end()) + auto external_tensor = _migrant.find(ind); + if (external_tensor != _migrant.end()) return external_tensor->second; - return getManagedTensor(ind); + return getNativeTensor(ind); } - std::shared_ptr getManagedITensor(const ir::OperandIndex &ind) override + std::shared_ptr getNativeITensor(const ir::OperandIndex &ind) override { - return getManagedTensor(ind); + return getNativeTensor(ind); } std::shared_ptr 
getPortableTensor(const ir::OperandIndex &ind) { - auto external_tensor = _external.find(ind); - if (external_tensor != _external.end()) + auto external_tensor = _migrant.find(ind); + if (external_tensor != _migrant.end()) { if (external_tensor->second) return external_tensor->second; } - return getManagedTensor(ind); + return getNativeTensor(ind); } - std::shared_ptr getManagedTensor(const ir::OperandIndex &ind) + std::shared_ptr getNativeTensor(const ir::OperandIndex &ind) { - auto tensor = _managed.find(ind); - if (tensor != _managed.end()) + auto tensor = _native.find(ind); + if (tensor != _native.end()) return tensor->second; return nullptr; } - bool setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr &tensor) + bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) { // TODO Uncomment this as two tensors for an index is not allowed. // But now it is temporarily allowed as a workaround. External one hides Managed one. - // auto itr = _managed.find(ind); - // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr) + // auto itr = _native.find(ind); + // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr) // throw std::runtime_error{ - // "Tried to set an external tensor but an managed tensor already exists."}; - _external[ind] = tensor; + // "Tried to set an migrant tensor but an native tensor already exists."}; + _migrant[ind] = tensor; return true; } - void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) + void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) { - auto itr = _external.find(ind); - if (itr != _external.end() && itr->second != nullptr && tensor != nullptr) + auto itr = _migrant.find(ind); + if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr) throw std::runtime_error{ - "Tried to set a managed tensor but an external tensor already exists."}; - _managed[ind] = tensor; + "Tried to set a native tensor but an migrant tensor already exists."}; + _native[ind] = tensor; } - const ir::OperandIndexMap> &managed_tensors() { return _managed; } + const ir::OperandIndexMap> &native_tensors() { return _native; } - const ir::OperandIndexMap> &external_tensors() + const ir::OperandIndexMap> &migrant_tensors() { - return _external; + return _migrant; } private: - ir::OperandIndexMap> _external; - ir::OperandIndexMap> _managed; + ir::OperandIndexMap> _migrant; + ir::OperandIndexMap> _native; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h index 6ddacc7..a7e034a 100644 --- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h @@ -19,7 +19,7 @@ #include "MemoryManager.h" -#include "backend/ITensorManager.h" +#include "backend/IStaticTensorManager.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "TensorRegistry.h" @@ -31,7 +31,7 @@ namespace backend namespace cpu_common { -class StaticTensorManager : public backend::ITensorManager +class StaticTensorManager : public backend::IStaticTensorManager { public: StaticTensorManager(const std::shared_ptr ®); diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h index 379143b..b3391a3 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInference.h +++ 
b/runtime/onert/core/include/compiler/StaticShapeInference.h @@ -99,6 +99,7 @@ private: void visit(const ir::operation::LogicalNot &op) override; void visit(const ir::operation::LogicalOr &op) override; void visit(const ir::operation::Logistic &op) override; + void visit(const ir::operation::L2Normalization &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::Max &op) override; void visit(const ir::operation::Min &op) override; diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h index 113c348..601c1bf 100644 --- a/runtime/onert/core/include/exec/DynamicShapeInference.h +++ b/runtime/onert/core/include/exec/DynamicShapeInference.h @@ -72,6 +72,7 @@ public: void visit(const ir::operation::LogicalNot &op) override; void visit(const ir::operation::LogicalOr &op) override; void visit(const ir::operation::Logistic &op) override; + void visit(const ir::operation::L2Normalization &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::Max &op) override; void visit(const ir::operation::Min &op) override; diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h index 5fac54e..e3b5d19 100644 --- a/runtime/onert/core/include/ir/Operations.Include.h +++ b/runtime/onert/core/include/ir/Operations.Include.h @@ -103,3 +103,4 @@ #include "ir/operation/BatchMatMul.h" #include "ir/operation/FusedBatchNorm.h" #include "ir/operation/LogSoftmax.h" +#include "ir/operation/Quantize.h" diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst index 9d0642f..03a2aa2 100644 --- a/runtime/onert/core/include/ir/Operations.lst +++ b/runtime/onert/core/include/ir/Operations.lst @@ -106,3 +106,4 @@ OP(MatrixBandPart) OP(BatchMatMul) OP(FusedBatchNorm) OP(LogSoftmax) +OP(Quantize) diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h index 26a92d7..391b4ba 100644 --- a/runtime/onert/core/include/ir/operation/LogSoftmax.h +++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h @@ -48,7 +48,7 @@ public: public: void accept(OperationVisitor &v) const override; - OpCode opcode() const final { return OpCode::Softmax; } + OpCode opcode() const final { return OpCode::LogSoftmax; } public: const Param ¶m() const { return _param; } diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h index a486061..00481cd 100644 --- a/runtime/onert/core/include/ir/operation/Pad.h +++ b/runtime/onert/core/include/ir/operation/Pad.h @@ -33,7 +33,7 @@ public: { INPUT = 0, PAD = 1, - // VALUE = 2 Not allow padding value operand yet + VALUE = 2 }; public: diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h new file mode 100644 index 0000000..2533ce4 --- /dev/null +++ b/runtime/onert/core/include/ir/operation/Quantize.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
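The Quantize operation registered above in Operations.Include.h and Operations.lst, and declared below, maps a FLOAT32 input to the output operand's asymmetric uint8 encoding. For reference, the affine mapping such a kernel applies is q = clamp(round(x / scale) + zero_point, 0, 255); a minimal per-element sketch, illustrative only and not the cker implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = clamp(round(x / scale) + zero_point, 0, 255) for QUANT_UINT8_ASYMM.
uint8_t quantizeOne(float x, float scale, int32_t zero_point)
{
  const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}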
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__ +#define __ONERT_IR_OPERATION_QUANTIZE_H__ + +#include "ir/Operation.h" + +namespace onert +{ +namespace ir +{ +namespace operation +{ + +class Quantize : public Operation +{ +public: + enum Input + { + INPUT = 0, + }; + +public: + Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs); + +public: + void accept(OperationVisitor &v) const override; + OpCode opcode() const final { return OpCode::Quantize; } +}; + +} // namespace operation +} // namespace ir +} // namespace onert + +#endif // __ONERT_IR_OPERATION_QUANTIZE_H__ diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc index 32a8041..c374aba 100644 --- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc @@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptrgetManagedTensor(ind); + auto user_tensor = _user_tensors->getNativeTensor(ind); if (user_tensor) { // User tensors cannot be reallocated. @@ -47,8 +47,8 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha user_tensor->setShape(new_shape); } - // NOTE Then handle managed tensors - auto tensor = _tensors->getManagedTensor(ind); + // NOTE Then handle native tensors + auto tensor = _tensors->getNativeTensor(ind); assert(tensor); bool previously_dynamic = tensor->is_dynamic(); @@ -101,9 +101,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { - assert(_tensors->getManagedTensor(ind) == nullptr); + assert(_tensors->getNativeTensor(ind) == nullptr); auto tensor = std::make_shared(tensor_info, backend_layout); - _tensors->setManagedTensor(ind, tensor); + _tensors->setNativeTensor(ind, tensor); } void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) @@ -130,7 +130,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) auto &input_set = find->second; for (auto input_ind : input_set) { - if (!_tensors->getManagedTensor(input_ind)->is_dynamic()) + if (!_tensors->getNativeTensor(input_ind)->is_dynamic()) continue; _dynamic_mem_mgr->deallocate(input_ind); @@ -141,7 +141,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) { - if (!_tensors->getManagedTensor(output_ind)->is_dynamic()) + if (!_tensors->getNativeTensor(output_ind)->is_dynamic()) return; _dynamic_mem_mgr->deallocate(output_ind); diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc index 4b683fb..eb83b7d 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc @@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node) std::vector> input_tensors; for (const auto input_index 
: node.getInputs()) { - auto input_alloc = getTensor(input_index); + auto input_tensor = getTensor(input_index); - input_tensors.emplace_back(input_alloc); + input_tensors.emplace_back(input_tensor); } std::vector> output_tensors; exec::DynAllocInfoMap outputs_dyn_alloc_info; for (const auto output_index : node.getOutputs()) { - auto output_alloc = getTensor(output_index); + auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_alloc); + output_tensors.emplace_back(output_tensor); const auto output_tensor_builder = getTensorBuilder(output_index); if (output_tensor_builder->supportDynamicTensor()) { auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); - outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; + outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; } } @@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node) std::vector> input_tensors; for (const auto input_index : node.getInputs()) { - auto input_alloc = getTensor(input_index); + auto input_tensor = getTensor(input_index); - input_tensors.emplace_back(input_alloc); + input_tensors.emplace_back(input_tensor); } std::vector> output_tensors; std::unordered_map, exec::DynAllocInfo> outputs_dyn_alloc_info; for (const auto output_index : node.getOutputs()) { - auto output_alloc = getTensor(output_index); + auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_alloc); + output_tensors.emplace_back(output_tensor); const auto output_tensor_builder = getTensorBuilder(output_index); if (output_tensor_builder->supportDynamicTensor()) { auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); - outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; + outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; } } @@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index) for (auto tensor_builder : _tensor_builder_set) { auto reg = tensor_builder->tensorRegistry(); - auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index); + auto tensor = reg ? 
reg->getNativeITensor(index) : tensor_builder->tensorAt(index); if (tensor) { ret = tensor_builder; diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc index 16cd3ec..5bddb91 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc @@ -92,7 +92,7 @@ void TensorBuilder::allocate() std::shared_ptr TensorBuilder::tensorAt(const ir::OperandIndex &ind) { // NOTE Find from User Tensor Registry first - // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste + // FIXME There may be both user tensor and native tensor for a `ind` which is a waste auto user_tensor = _user_tensor_reg->getITensor(ind); auto tensor = _tensor_reg->getITensor(ind); if (user_tensor) @@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) { - return _tensor_reg->getManagedTensor(ind); + return _tensor_reg->getNativeTensor(ind); } std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) @@ -123,7 +123,7 @@ std::unique_ptr TensorBuilder::releaseDynamicTensorManager(void) void TensorBuilder::setUserTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) { - _user_tensor_reg->setManagedTensor(ind, tensor); + _user_tensor_reg->setNativeTensor(ind, tensor); } } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h index ce94ea0..b9b2d52 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensor.h +++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h @@ -68,6 +68,7 @@ public: void set_dynamic() override { _dynamic = true; } ir::Shape getShape() const override { return _info.shape(); } void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); } + bool is_constant() const override { return false; } private: ir::OperandInfo _info; diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc index 0ccf700..ede403b 100644 --- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc @@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha { VERBOSE_F() << ind << std::endl; - auto tensor = _tensors->getManagedTensor(ind); + auto tensor = _tensors->getNativeTensor(ind); assert(tensor); bool previously_dynamic = tensor->is_dynamic(); @@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { - assert(_tensors->getManagedTensor(ind) == nullptr); + assert(_tensors->getNativeTensor(ind) == nullptr); auto tensor = std::make_shared(tensor_info, backend_layout); - _tensors->setManagedTensor(ind, tensor); + _tensors->setNativeTensor(ind, tensor); } void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) @@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) auto &input_set = find->second; for (auto input_ind : input_set) { - auto *tensor = _tensors->getManagedTensor(input_ind).get(); + auto *tensor = _tensors->getNativeTensor(input_ind).get(); if (!tensor->is_dynamic()) continue; @@ -131,7 +131,7 @@ void 
DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) { - auto *tensor = _tensors->getManagedTensor(output_ind).get(); + auto *tensor = _tensors->getNativeTensor(output_ind).get(); if (!tensor->is_dynamic()) return; diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc index 47bea35..8604542 100644 --- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc @@ -33,7 +33,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr & void StaticTensorManager::allocateConsts(void) { - for (auto &pair : _tensors->managed_tensors()) + for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; auto tensor = pair.second; @@ -42,9 +42,9 @@ void StaticTensorManager::allocateConsts(void) auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size()); tensor->setBuffer(mem_alloc); auto buffer = mem_alloc->base(); - VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() - << "): " << static_cast(buffer) - << "size : " << tensor->total_size() << std::endl; + VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() + << "): " << static_cast(buffer) + << "size : " << tensor->total_size() << std::endl; } } } @@ -53,7 +53,7 @@ void StaticTensorManager::allocateNonconsts(void) { _nonconst_mgr->allocate(); - for (auto &pair : _tensors->managed_tensors()) + for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; auto tensor = pair.second; @@ -62,8 +62,8 @@ void StaticTensorManager::allocateNonconsts(void) auto *buffer = _nonconst_mgr->getBuffer(ind); tensor->setBuffer(buffer); - VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() - << "): " << static_cast(buffer) << std::endl; + VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value() + << "): " << static_cast(buffer) << std::endl; } } } @@ -76,18 +76,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout, bool as_const) { - assert(!_tensors->getManagedTensor(ind)); + assert(!_tensors->getNativeTensor(ind)); auto tensor = std::make_shared(tensor_info, backend_layout); - _tensors->setManagedTensor(ind, tensor); + _tensors->setNativeTensor(ind, tensor); _as_constants[ind] = as_const; } void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) { - assert(_tensors->getManagedTensor(ind)); + assert(_tensors->getNativeTensor(ind)); // This method is called only when a tensor has proper shape - assert(!_tensors->getManagedTensor(ind)->is_dynamic()); + assert(!_tensors->getNativeTensor(ind)->is_dynamic()); if (!_as_constants[ind]) _nonconst_mgr->claimPlan(ind, size); @@ -95,10 +95,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) { - assert(_tensors->getManagedTensor(ind)); + assert(_tensors->getNativeTensor(ind)); // This method is called only when a tensor has proper shape - assert(!_tensors->getManagedTensor(ind)->is_dynamic()); + assert(!_tensors->getNativeTensor(ind)->is_dynamic()); if (!_as_constants[ind]) _nonconst_mgr->releasePlan(ind); @@ -106,7 +106,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) void StaticTensorManager::iterate(const std::function &fn) { - for (const 
auto &it : _tensors->managed_tensors()) + for (const auto &it : _tensors->native_tensors()) fn(it.first); } diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index f3f69ad..8439b6a 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -201,18 +201,35 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph, // Add tensor to controlflow TensorRegistry. cf_tensor_builder->setUserTensor(ind, tensor); ret.push_back(tensor); - - // Set other tensors as external tensors - for (auto &tensor_builder : tensor_builders) - { - // FIXME This is a workaround registering all user tensors to all backends - // FIXME Handle when it is failed - tensor_builder->setExternalTensor(ind, tensor); - } } return ret; } +void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph, + TensorBuilders &tensor_builders) +{ + lowered_graph.op_seqs().iterate( + [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) { + auto lower_info = lowered_graph.getLowerInfo(op_seq_index); + auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend()); + for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED | + ir::Remove::UNDEFINED) + { + // If an OpSequence input/output tensor does not have a own tensor object, + // it must be using external tensors, so find the tensor from other tensor builders and + // set the tensor to this tensor builder if portable + if (!backend_ctx->tensor_builder->tensorAt(ind)) + { + auto tensor = tensor_builders.getITensor(ind); + assert(tensor); // The tensor must have been created in one of TensorBuilders + auto ptensor = std::dynamic_pointer_cast(tensor); + if (ptensor) + backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor); + } + } + }); +} + exec::IExecutor * ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_graph, const compiler::CompilerOptions &options, @@ -265,6 +282,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_ tensor_builder->prepare(); } + prepareExternalTensors(*lowered_graph, tensor_builders); + ExecutionBuilder builder; // Generate kernels @@ -367,6 +386,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( tensor_builder->prepare(); } + prepareExternalTensors(*lowered_graph, tensor_builders); + ExecutionBuilder builder; // Generate kernels diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index 1e82b98..418e5a7 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -22,6 +22,7 @@ #include "backend/ITensor.h" #include "exec/IExecutor.h" #include "ir/LoweredGraph.h" +#include "TensorBuilders.h" namespace onert { @@ -48,6 +49,8 @@ private: static std::vector> initializeModelIOTensors(ir::LoweredGraph &lowered_graph, const ir::OperandIndexSequence &indices); + static void prepareExternalTensors(ir::LoweredGraph &lowered_graph, + TensorBuilders &tensor_builders); static exec::IExecutor * createLinearExecutor(std::unique_ptr lowered_graph, const compiler::CompilerOptions &options, diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h index f507539..d8ceca9 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.h +++ b/runtime/onert/core/src/compiler/HEScheduler.h @@ -51,16 +51,12 @@ public: * @param[in] backend_resolver 
backend resolver */ HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options) - : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{}, + : _is_supported{}, _backends_avail_time{}, _ops_eft{}, _op_to_rank{std::make_shared>()}, _is_profiling_mode{options.he_profiling_mode}, _is_linear_exec{options.executor == "Linear"}, _is_parallel_exec{options.executor == "Parallel"} { - // Workaround to avoid unused-private-field warning - // TODO use _backend_contexts and remove workaround - (void)_backend_contexts; - for (auto &entry : backend_contexts) { _all_backends.push_back(entry.first); @@ -165,7 +161,6 @@ private: // whether it should assign these backends to these nodes: // * It stores false for unsupported nodes // * During rank calculation with enabled profiling mode it stores true for supported nodes - const backend::BackendContexts &_backend_contexts; std::unordered_map> _is_supported; // Finishing and starting time of each backend std::unordered_map> _backends_avail_time; @@ -175,8 +170,7 @@ private: std::unique_ptr _backend_resolver; std::unique_ptr _exec_time; const ir::Graph *_graph{nullptr}; - std::vector - _all_backends; // TODO Remove this and use _backend_contexts instead + std::vector _all_backends; const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend bool _is_profiling_mode; bool _is_linear_exec; diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc index 5c545ae..fa5ee27 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.cc +++ b/runtime/onert/core/src/compiler/OperationValidator.cc @@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph) { } +void OperationValidator::checkUnaryOp(const ir::Operation &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + + // Check if I/O types match + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); + + if (_ctx.at(output_index).info().isDynamic()) + return; + + // Check if I/O shapes match + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + void OperationValidator::operator()() { // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when @@ -53,16 +68,7 @@ void OperationValidator::operator()() [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); } -void OperationValidator::visit(const ir::operation::Abs &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::AvgPool2D &node) { @@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node) num_units == _ctx.at(hidden_state_out_index).shape().dim(1)); } -void OperationValidator::visit(const ir::operation::Round &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)}; - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - 
OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) { @@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node) } } -void OperationValidator::visit(const ir::operation::Exp &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::ExpandDims &node) { @@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node) OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); } -void OperationValidator::visit(const ir::operation::Floor &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::HashtableLookup &node) { @@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node) } } +void OperationValidator::visit(const ir::operation::L2Normalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + + OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); + + for (auto i = 0; i < ifm_shape.rank(); i++) + { + OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); + } +} + void OperationValidator::visit(const ir::operation::Unpack &node) { const auto num{node.param().num}; @@ -904,35 +899,11 @@ void OperationValidator::visit(const ir::operation::Split &node) OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); } -void OperationValidator::visit(const ir::operation::Cos &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} - -void OperationValidator::visit(const ir::operation::Sin &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; +void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); } - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); } -void OperationValidator::visit(const ir::operation::RSQRT &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if 
(_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::Shape &node) { @@ -972,35 +943,11 @@ void OperationValidator::visit(const ir::operation::While &node) // TODO Add to validate with subgraphs } -void OperationValidator::visit(const ir::operation::Neg &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; +void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); } - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); } -void OperationValidator::visit(const ir::operation::Log &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} - -void OperationValidator::visit(const ir::operation::LogicalNot &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::SquaredDifference &node) { @@ -1118,5 +1065,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node) OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); } + +void OperationValidator::visit(const ir::operation::Quantize &node) +{ + VERBOSE(Quantize) << "Configure Quantize operation" << std::endl; + + OP_REQUIRES(node.getInputs().size() == 1); + OP_REQUIRES(node.getOutputs().size() == 1); + + const auto input_index{node.getInputs().at(0)}; + const auto output_index{node.getOutputs().at(0)}; + + OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32); + + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} } // namespace compiler } // namespace onert diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h index 6ceafe8..55a4dd5 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.h +++ b/runtime/onert/core/src/compiler/OperationValidator.h @@ -70,6 +70,7 @@ public: void visit(const ir::operation::DepthToSpace &node) override; void visit(const ir::operation::Pack &node) override; void visit(const ir::operation::LSTM &node) override; + void visit(const ir::operation::L2Normalization &node) override; void visit(const ir::operation::Unpack &node) override; void visit(const ir::operation::Pad &node) override; void visit(const ir::operation::Min &node) override; @@ -93,9 +94,10 @@ public: void visit(const ir::operation::Range &node) override; void visit(const ir::operation::MatrixBandPart &node) override; void visit(const ir::operation::LogSoftmax &node) 
override; + void visit(const ir::operation::Quantize &node) override; private: - void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index); + void checkUnaryOp(const ir::Operation &node); private: // TODO Remove _ctx field diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc index 5a58f2e..66de599 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInference.cc +++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc @@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT)); } +void StaticShapeInferer::visit(const ir::operation::L2Normalization &op) +{ + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT)); +} + void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)); diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h index 4bb7413..c0a1ebc 100644 --- a/runtime/onert/core/src/compiler/TensorBuilders.h +++ b/runtime/onert/core/src/compiler/TensorBuilders.h @@ -23,6 +23,7 @@ #include "backend/Backend.h" #include "backend/controlflow/Config.h" #include "backend/controlflow/TensorBuilder.h" +#include "util/logging.h" namespace onert { @@ -66,6 +67,17 @@ public: return _cf_tensor_builder; } + std::shared_ptr getITensor(ir::OperandIndex ind) + { + for (auto &tensor_builder : _tensor_builders) + { + auto tensor = tensor_builder->tensorAt(ind); + if (tensor) + return tensor; + } + return nullptr; + } + private: std::unordered_set> _tensor_builders; std::shared_ptr _cf_tensor_builder; diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc index 1b82029..28e92ba 100644 --- a/runtime/onert/core/src/exec/DynamicShapeInference.cc +++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc @@ -442,6 +442,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT)); } +void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op) +{ + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT)); +} + void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT)); diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index a7409b9..864ccb3 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, { auto tensor_registry = tensor_builder->tensorRegistry(); assert(tensor_registry); - tensor = tensor_registry->getManagedITensor(ind); + tensor = tensor_registry->getNativeITensor(ind); if (tensor != nullptr) { if (tensor_builder->supportDynamicTensor()) @@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, { auto tensor_registry = tensor_builder->tensorRegistry(); assert(tensor_registry); - tensor = tensor_registry->getManagedITensor(ind); + tensor = tensor_registry->getNativeITensor(ind); if (tensor != nullptr) { if (tensor_builder->supportDynamicTensor()) diff --git a/runtime/onert/core/src/interp/operations/Pad.cc 
b/runtime/onert/core/src/interp/operations/Pad.cc index d2e3627..c8dce69 100644 --- a/runtime/onert/core/src/interp/operations/Pad.cc +++ b/runtime/onert/core/src/interp/operations/Pad.cc @@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso const int32_t *pad_ptr = reinterpret_cast(pad_buffer); float *output_ptr = reinterpret_cast(output_buffer); - nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr, - nullptr); + nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, + output_ptr, nullptr); } void invokePad(const ExecEnv *env, const ir::Operation &node) diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc index 6e93a23..f138089 100644 --- a/runtime/onert/core/src/ir/LoweredGraph.cc +++ b/runtime/onert/core/src/ir/LoweredGraph.cc @@ -122,9 +122,6 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions & pass::PermutationInsertionPass pi_pass(*this); pi_pass.run(); - // Implemented code no longer works. - // pass::PermutationEliminationPass pe_pass(*this); - // pe_pass.run(); _op_seqs.dump("merged and sorted operations with permutation", _graph.operations()); } diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc new file mode 100644 index 0000000..0e3d5b6 --- /dev/null +++ b/runtime/onert/core/src/ir/operation/Quantize.cc @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ir/operation/Quantize.h" + +#include "ir/OperationVisitor.h" + +namespace onert +{ +namespace ir +{ +namespace operation +{ + +void Quantize::accept(OperationVisitor &v) const { v.visit(*this); } + +Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) + : Operation{OperandConstraint::createExact(2u), inputs, outputs} +{ +} + +} // namespace operation +} // namespace ir +} // namespace onert diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc deleted file mode 100644 index 9e0291e..0000000 --- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "PermutationEliminationPass.h" - -#include "ir/Operand.h" -#include "ir/operand/LowerInfo.h" -#include "ir/Graph.h" -#include "backend/IConfig.h" -#include "util/logging.h" - -namespace onert -{ -namespace ir -{ -namespace pass -{ -void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object) -{ - if (_graph.getInputs().contains(inp_index)) - { - eliminateInput(inp_index, object); - } - else if (_graph.getOutputs().contains(inp_index)) - { - eliminateOutput(inp_index, object); - } -} - -void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object) -{ - auto &model_inputs = _graph.getInputs(); - - // get uses of the model's given input - auto uses = object.getUses(); - - // input must be used just by permutation - if (uses.size() != 1) - { - return; - } - - for (auto input_use : uses) - { - auto &perm_operation = _graph.operations().at(input_use); - auto perm_inputs = perm_operation.getInputs(); - - auto perm_outputs = perm_operation.getOutputs(); - - if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true)) - { - return; - } - - assert(perm_inputs.at(0) == inp_index); - - VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n"; - - // set model's new input, which was output of permutation - model_inputs.replace(inp_index, perm_outputs.at(0)); - - // remove model's input, which is also input of permutation - _graph.removeOperand(inp_index); - - // remove permutation operation - assert(_lowered_graph.op_seqs().containsOperation(input_use)); - auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use); - _lowered_graph.op_seqs().remove(op_seq_idx); - _graph.operations().remove(input_use); - - VERBOSE(PermutationEliminationPass::EliminateInput) - << inp_index.value() << " is model's input and is removed. New input is " - << perm_outputs.at(0).value() << "\n" - << input_use.value() << " is removed permutation operation\n"; - } -} - -void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object) -{ - auto &model_outputs = _graph.getOutputs(); - - // get defs of the model's given output - auto defs = object.getDef(); - - // output must use just permutation - if (defs.size() != 1) - { - return; - } - - for (auto output_def : defs) - { - auto &perm_operation = _graph.operations().at(output_def); - auto perm_outputs = perm_operation.getOutputs(); - - auto perm_inputs = perm_operation.getInputs(); - if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false)) - { - return; - } - - assert(perm_outputs.at(0) == out_index); - - VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n"; - - // Update operations' output that is used by permute operand - for (auto perm_input_index : perm_inputs) - { - auto &perm_input_operand = _graph.operands().at(perm_input_index); - perm_input_operand.removeUse(output_def); - } - - // set model's new output, which was input of permutation - model_outputs.replace(out_index, perm_inputs.at(0)); - - // remove model's output, which is also output of permutation - _graph.removeOperand(out_index); - - // remove permutation operation - assert(_lowered_graph.op_seqs().containsOperation(output_def)); - auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def); - _lowered_graph.op_seqs().remove(op_seq_idx); - _graph.operations().remove(output_def); - - VERBOSE(PermutationEliminationPass::EliminateOutput) - << out_index.value() << " is model's output and is removed. 
New output is " - << perm_inputs.at(0).value() << "\n" - << output_def.value() << " is removed permutation operation\n"; - } -} - -bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, - const OperandIndexSequence &out_indexes, - bool is_for_model_input) -{ - auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors(); - auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors(); - - auto input_layout = input_def_factors.getOnlyElement().layout(); - auto output_layout = output_def_factors.getOnlyElement().layout(); - - if (input_def_factors.size() != 1 || output_def_factors.size() != 1) - { - return false; - } - - // all operands' factor must be the same - for (auto index : inp_indexes) - { - auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); - if (op_factor_set.size() != 1 || - input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) - { - return false; - } - } - // all operands' factor must be the same - for (auto index : out_indexes) - { - auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); - if (op_factor_set.size() != 1 || - output_layout != - _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) - { - return false; - } - } - - if (is_for_model_input) - { - // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input - return (inp_indexes.size() == 1 && input_layout == Layout::NHWC && - output_layout == Layout::NCHW); - } - - // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output - return (out_indexes.size() == 1 && input_layout == Layout::NCHW && output_layout == Layout::NHWC); -} - -} // namespace pass -} // namespace ir -} // namespace onert diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h deleted file mode 100644 index 1c84300..0000000 --- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ -#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ - -#include "LoweredOperandPass.h" -#include "ir/Operand.h" -#include "ir/OperandIndexSequence.h" - -namespace onert -{ -namespace ir -{ -namespace pass -{ - -class PermutationEliminationPass : public LoweredOperandPass -{ -public: - using LoweredOperandPass::LoweredOperandPass; - -public: - std::string id() override { return "PermutationEliminationPass"; } - - void callback(const OperandIndex &index, Operand &object) override; - -private: - /** - * @brief Remove Permute operation that permutates input - * - * Note: This function aslo removes model's input and - * sets output of permutation as model's new input - * - * @param inp_index is the target operand index for the elimination - * @param object is the target operand object for the elimination - * - * @return - */ - void eliminateInput(const OperandIndex &inp_index, Operand &object); - - /** - * @brief Remove Permute operation that permutates output of a model - * - * Note: This function aslo removes model's output and - * sets input of permutation as model's new output - * - * @param out_index is the target operand index for the elimination - * @param object is the target operand object for the elimination - * - * @return - */ - void eliminateOutput(const OperandIndex &out_index, Operand &object); - - /** - * @brief Determine if passed operands are permute layer's input and output, that must be - * eliminated - * - * @param inp_index indexes of the input operand to operation - * @param out_index indexes of the output operand to operation - * @param is_for_model_input checking for model's input or output - * - * @return if it is permutation layer - */ - bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, - const OperandIndexSequence &out_indexes, bool is_for_model_input); -}; - -} // namespace pass -} // namespace ir -} // namespace onert - -#endif // __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc index 7c3da52..75efdd8 100644 --- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc +++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc @@ -62,27 +62,26 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand &obje auto insert_set = operand_li->use_factors() - operand_li->def_factors(); auto def_factor = operand_li->def_factors().getOnlyElement(); - auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) { - // TODO If other issues for Permute elimination are resolved, enable this - return false; - /* + auto compatible_backends = [](auto backend1, auto backend2) { // TODO This is a workaround for not inserting Permute between cpu and controlflow. // To be general, we need another way of checking they are compatible. const auto cf = backend::controlflow::Config::ID; const auto cpu = "cpu"; const auto id1 = backend1->config()->id(); const auto id2 = backend2->config()->id(); - return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs - || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs - */ + // NOTE This is to skip Permute insertion for model inputs(controlflow -> cpu), but not + // outputs. This function currently assumes that backend1 is Def and backend2 is Use. However + // it is going to be fixed soon. 
+ // TODO make both ways work + return (id1 == cpu && id2 == cf); }; for (auto factor : insert_set) { + // Check exceptional cases that Permute ops are not inserted if (factor.layout() == def_factor.layout() && compatible_backends(factor.backend(), def_factor.backend())) { - // For this factor we can just reuse existing operand - Permute is not added. VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand " << index << " / as the tensor is compatible with backend " << factor.backend()->config()->id() << std::endl; diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index f5687ad..f763346 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -171,6 +171,8 @@ protected: void loadBroadcastTo(const Operator *op, ir::Graph &subg); void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); void loadLogSoftmax(const Operator *op, ir::Graph &subg); + void loadQuantize(const Operator *op, ir::Graph &subg); + void loadSpaceToDepth(const Operator *op, ir::Graph &subg); protected: // Base address for mapped region for loading (if needed) @@ -1123,6 +1125,22 @@ void BaseLoader::loadBroadcastTo(const Operator *o std::unique_ptr new_op(new ir::operation::BroadcastTo(inputs, outputs)); subg.addOperation(std::move(new_op)); } +template +void BaseLoader::loadSpaceToDepth(const Operator *op, ir::Graph &subg) +{ + ir::OperandIndexSequence inputs; + ir::OperandIndexSequence outputs; + ir::operation::SpaceToDepth::Param param; + + const auto *options = op->builtin_options_as_SpaceToDepthOptions(); + + param.block_size = options->block_size(); + + loadOperationIO(op, inputs, outputs); + + std::unique_ptr new_op(new ir::operation::SpaceToDepth(inputs, outputs, param)); + subg.addOperation(std::move(new_op)); +} template void BaseLoader::loadCustom(const Operator *op, ir::Graph &subg) @@ -1743,6 +1761,18 @@ void BaseLoader::loadLogSoftmax(const Operator *op } template +void BaseLoader::loadQuantize(const Operator *op, ir::Graph &subg) +{ + ir::OperandIndexSequence inputs; + ir::OperandIndexSequence outputs; + + loadOperationIO(op, inputs, outputs); + + std::unique_ptr new_op(new ir::operation::Quantize(inputs, outputs)); + subg.addOperation(std::move(new_op)); +} + +template void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg) { const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); @@ -1959,6 +1989,12 @@ void BaseLoader::loadOperation(const Operator *op, case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX: loadLogSoftmax(op, subg); return; + case BuiltinOperator::BuiltinOperator_QUANTIZE: + loadQuantize(op, subg); + return; + case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH: + loadSpaceToDepth(op, subg); + return; default: throw std::runtime_error( std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc index 94791f8..00ffcb6 100644 --- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc @@ -106,6 +106,33 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type) }; } +template +Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &) +{ + assert(init_param.input_count == 1 && init_param.output_count 
== 1); + + OperandIndexSequence outputs{init_param.outputs[0]}; + + // Each input should be interpreted as follows: + // + // 0 -> Input Tensor Index + OperandIndexSequence inputs{init_param.inputs[0]}; + + return new T{inputs, outputs}; +} + +// A generator function for binary ops with no params +template +Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &) +{ + assert(init_param.input_count == 2 && init_param.output_count == 1); + + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; + OperandIndexSequence outputs{init_param.outputs[0]}; + + return new T{inputs, outputs}; +} + } // namespace OperationFactory &OperationFactory::get() @@ -116,20 +143,10 @@ OperationFactory &OperationFactory::get() OperationFactory::OperationFactory() { - _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param, - Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> Block size Index - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::BatchToSpaceND{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> Input Tensor Index + // 1 -> Block size Index + _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp; _map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -724,44 +741,11 @@ OperationFactory::OperationFactory() return new operation::Squeeze{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Tanh{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); + _map[ANEURALNETWORKS_TANH] = CreateSimpleUnaryOp; - OperandIndexSequence outputs{init_param.outputs[0]}; + _map[ANEURALNETWORKS_LOG] = CreateSimpleUnaryOp; - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Log{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Logistic{inputs, outputs}; - }; + _map[ANEURALNETWORKS_LOGISTIC] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -784,36 +768,16 @@ OperationFactory::OperationFactory() return new operation::Div{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence 
outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Exp{inputs, outputs}; - }; + _map[ANEURALNETWORKS_EXP] = CreateSimpleUnaryOp; // ANEURALNETWORKS_EXP_EX is deprecated // TODO Remove ANEURALNETWORKS_EXP_EX _map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP]; - _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> Axis Tensor Index - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::ExpandDims{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> Input Tensor Index + // 1 -> Axis Tensor Index + _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp; _map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); @@ -982,19 +946,7 @@ OperationFactory::OperationFactory() return new operation::Comparison{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input0 Tensor Index - // 1 -> input1 Tensor Index - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::LogicalAnd{inputs, outputs}; - }; + _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp; // ANEURALNETWORKS_LOGICAL_AND_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX @@ -1018,18 +970,7 @@ OperationFactory::OperationFactory() return new operation::LogicalAnd{inputs, outputs}; }; - _map[ANEURALNETWORKS_RSQRT] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::RSQRT{inputs, outputs}; - }; + _map[ANEURALNETWORKS_RSQRT] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -1065,18 +1006,7 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_RSQRT_EX _map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT]; - _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::ReLU{inputs, outputs}; - }; + _map[ANEURALNETWORKS_RELU] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -1098,31 +1028,9 @@ OperationFactory::OperationFactory() return new operation::ResizeBilinear{inputs, outputs, 
param}; }; - _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; + _map[ANEURALNETWORKS_RELU1] = CreateSimpleUnaryOp; - return new operation::ReLU1{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::ReLU6{inputs, outputs}; - }; + _map[ANEURALNETWORKS_RELU6] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); @@ -1438,18 +1346,7 @@ OperationFactory::OperationFactory() return new operation::LogicalOr{inputs, outputs}; }; - _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::LogicalNot{inputs, outputs}; - }; + _map[ANEURALNETWORKS_LOGICAL_NOT] = CreateSimpleUnaryOp; // ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX @@ -1649,35 +1546,13 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_GATHER_EX _map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER]; - _map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Neg{inputs, outputs}; - }; + _map[ANEURALNETWORKS_NEG] = CreateSimpleUnaryOp; // ANEURALNETWORKS_NEG_EX is deprecated // TODO Remove ANEURALNETWORKS_NEG_EX _map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG]; - _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Abs{inputs, outputs}; - }; + _map[ANEURALNETWORKS_ABS] = CreateSimpleUnaryOp; // ANEURALNETWORKS_ABS_EX is deprecated // TODO Remove ANEURALNETWORKS_ABS_EX @@ -1704,18 +1579,7 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_ARGMAX_EX _map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX]; - _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 
0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Dequantize{inputs, outputs}; - }; + _map[ANEURALNETWORKS_DEQUANTIZE] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -1841,31 +1705,24 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count >= 1); + assert(init_param.input_count >= 2 && init_param.input_count <= 3 && + init_param.output_count >= 1); OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; + if (init_param.input_count == 3) + { + inputs.append(OperandIndex{init_param.inputs[2]}); + } OperandIndexSequence outputs{init_param.outputs[0]}; return new operation::Pad{inputs, outputs}; }; - _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); + _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD]; - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - OperandIndexSequence outputs{init_param.outputs[0]}; + _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp; - return new operation::Min{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - OperandIndexSequence outputs{init_param.outputs[0]}; - - return new operation::Max{inputs, outputs}; - }; + _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp; _map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -1948,34 +1805,15 @@ OperationFactory::OperationFactory() return new operation::Range{inputs, outputs}; }; - _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); + // Each input should be interpreted as follows: + // 0 -> LHS Tensor Index + // 1 -> RHS Tensor Index + _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp; - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> LHS Tensor Index - // 1 -> RHS Tensor Index - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::Pow{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - // Each input should be interpreted as follows: - // - // 0 -> A tensor, specifying the input. - // 1 -> A 1-D tensor, specifying the value - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - OperandIndexSequence outputs{init_param.outputs[0]}; - - return new operation::Fill{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> A tensor, specifying the input. 
+ // 1 -> A 1-D tensor, specifying the value + _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp; _map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 1 && init_param.output_count == 1); @@ -1989,20 +1827,10 @@ OperationFactory::OperationFactory() return new operation::ZerosLike{inputs, outputs}; }; - _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> Multiple Tensor Index - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::Tile{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> Input Tensor Index + // 1 -> Multiple Tensor Index + _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp; _map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param, Operands &) { @@ -2064,21 +1892,9 @@ OperationFactory::OperationFactory() return new operation::Einsum{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param, - Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> int32, int64, An 1-D int tensor Index - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::BroadcastTo{inputs, outputs}; - }; + // 0 -> Input Tensor Index + // 1 -> int32, int64, An 1-D int tensor Index + _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp; _map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -2133,6 +1949,15 @@ OperationFactory::OperationFactory() return new operation::LogSoftmax{inputs, outputs, param}; }; + + _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 1 && init_param.output_count == 1); + + OperandIndexSequence inputs{init_param.inputs[0]}; + OperandIndexSequence outputs{init_param.outputs[0]}; + + return new operation::Quantize{inputs, outputs}; + }; } Operation *OperationFactory::create(ANeuralNetworksOperationType type, diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc index cc04347..0fcf372 100644 --- a/runtime/onert/test/core/exec/ExecInstance.cc +++ b/runtime/onert/test/core/exec/ExecInstance.cc @@ -73,9 +73,8 @@ public: // Compile auto subgs = std::make_shared(); subgs->push(onert::ir::SubgraphIndex{0}, graph); - auto compiler = new onert::compiler::Compiler{subgs}; - executors = compiler->compile(); - delete compiler; + onert::compiler::Compiler compiler{subgs}; + executors = compiler.compile(); } public: @@ -98,19 +97,17 @@ TEST(ExecInstance, simple) float output_buffer[4] = {}; const float output_expected[4] = {5, -2, 0, -1}; - auto execution = new onert::exec::Execution(executors); + onert::exec::Execution execution{executors}; - execution->setInput(input1, reinterpret_cast(input1_buffer), 16); - execution->setInput(input2, reinterpret_cast(input2_buffer), 16); - execution->setOutput(output, reinterpret_cast(output_buffer), 16); - execution->execute(); + 
execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.execute(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(output_buffer[i], output_expected[i]); } - - delete execution; } TEST(ExecInstance, twoCompile) @@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile) auto mockup = CompiledMockUpModel(); auto graph = mockup.graph; auto executors1 = mockup.executors; - auto execution1 = new onert::exec::Execution(executors1); + onert::exec::Execution execution1{executors1}; auto input1 = IOIndex{0}; auto input2 = IOIndex{1}; @@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile) float exe1_output_buffer[4] = {}; const float exe1_output_expected[4] = {5, -2, 0, -1}; - execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); - execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); - execution1->setOutput(output, reinterpret_cast(exe1_output_buffer), 16); + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output, reinterpret_cast(exe1_output_buffer), 16); // Make new executor: compile again auto subgs = std::make_shared(); subgs->push(onert::ir::SubgraphIndex{0}, graph); - auto compiler = new onert::compiler::Compiler{subgs}; - std::shared_ptr executors2 = compiler->compile(); - auto execution2 = new onert::exec::Execution(executors2); + onert::compiler::Compiler compiler{subgs}; + std::shared_ptr executors2 = compiler.compile(); + onert::exec::Execution execution2{executors2}; const float exe2_input1_buffer[4] = {2, 1, -2, 0}; const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; float exe2_output_buffer[4] = {}; const float exe2_output_expected[4] = {2, 5, -2, 7}; - execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); - execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); - execution2->setOutput(output, reinterpret_cast(exe2_output_buffer), 16); + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output, reinterpret_cast(exe2_output_buffer), 16); - execution1->execute(); - execution2->execute(); + execution1.execute(); + execution2.execute(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); } - - delete compiler; - delete execution1; - delete execution2; } // Support two initialized execution instance then ordered execution @@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution) const float exe1_output_expected[4] = {5, -2, 0, -1}; const float exe2_output_expected[4] = {2, 5, -2, 7}; - auto execution1 = new onert::exec::Execution(executors); - execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); - execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); - execution1->setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); + onert::exec::Execution execution1{executors}; + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); const float exe2_input1_buffer[4] = {2, 1, -2, 0}; const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; float exe2_output_buffer[4] = {}; // 
Make new execution - auto execution2 = new onert::exec::Execution(executors); - execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); - execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); - execution2->setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); + onert::exec::Execution execution2{executors}; + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); - execution1->execute(); - execution2->execute(); + execution1.execute(); + execution2.execute(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); } - - delete execution1; - delete execution2; } class Inference @@ -222,14 +212,12 @@ public: auto input2 = IOIndex{1}; auto output1 = IOIndex{0}; - auto execution = new onert::exec::Execution(_executors); - execution->setInput(input1, reinterpret_cast(_input1), 16); - execution->setInput(input2, reinterpret_cast(_input2), 16); - execution->setOutput(output1, reinterpret_cast(_output), 16); + onert::exec::Execution execution{_executors}; + execution.setInput(input1, reinterpret_cast(_input1), 16); + execution.setInput(input2, reinterpret_cast(_input2), 16); + execution.setOutput(output1, reinterpret_cast(_output), 16); - execution->execute(); - - delete execution; + execution.execute(); } private: @@ -288,20 +276,18 @@ TEST(ExecInstance, async) float output_buffer[4] = {}; const float output_expected[4] = {5, -2, 0, -1}; - auto execution = new onert::exec::Execution(executors); + onert::exec::Execution execution{executors}; - execution->setInput(input1, reinterpret_cast(input1_buffer), 16); - execution->setInput(input2, reinterpret_cast(input2_buffer), 16); - execution->setOutput(output, reinterpret_cast(output_buffer), 16); - execution->startExecute(); - execution->waitFinish(); + execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.startExecute(); + execution.waitFinish(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(output_buffer[i], output_expected[i]); } - - delete execution; } } // namespace diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl index e50b941..005f61c 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl @@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_float32_to_int32_nnfw GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 GeneratedTests.gather_float16_8 GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw GeneratedTests.log_4D_float_nnfw @@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw 
GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon index c9edee5..d987bf1 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon @@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed -GeneratedTests.cast_float32_to_quant8_overflow -GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -73,6 +70,7 @@ GeneratedTests.gather_float16_8 GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_boolean GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_boolean GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw @@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu index 3cce4f3..bc0ae0f 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu @@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 GeneratedTests.hashtable_lookup_float GeneratedTests.hashtable_lookup_float_4D_nnfw GeneratedTests.hashtable_lookup_quant8 -GeneratedTests.l2_normalization -GeneratedTests.l2_normalization_2 -GeneratedTests.l2_normalization_large GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 
GeneratedTests.l2_pool_float_large @@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 GeneratedTests.neg GeneratedTests.neg_3D_int_nnfw GeneratedTests.neg_4D_int_nnfw -GeneratedTests.pad_quant8_nnfw GeneratedTests.prelu GeneratedTests.prelu_broadcast_float_1_nnfw GeneratedTests.prelu_broadcast_quant8_1_nnfw @@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.reduce_max_quant8 GeneratedTests.reduce_max_quant8_1_nnfw GeneratedTests.reduce_max_quant8_2 @@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 GeneratedTests.select_v1_2_two_dim_quant8 GeneratedTests.slice_5 GeneratedTests.slice_6 -GeneratedTests.slice_7 GeneratedTests.slice_8 GeneratedTests.slice_zero_sized GeneratedTests.slice_zero_sized_quant8 -GeneratedTests.space_to_depth_float_1 -GeneratedTests.space_to_depth_float_2 -GeneratedTests.space_to_depth_float_3 GeneratedTests.space_to_depth_quant8_1 GeneratedTests.space_to_depth_quant8_2 GeneratedTests.sqrt_ diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl index e50b941..005f61c 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl @@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_float32_to_int32_nnfw GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 GeneratedTests.gather_float16_8 GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw GeneratedTests.log_4D_float_nnfw @@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon index 55cfe39..051fbc7 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon @@ 
-23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed -GeneratedTests.cast_float32_to_quant8_overflow -GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_boolean GeneratedTests.greater_equal_dynamic_float_nnfw GeneratedTests.less_boolean +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw GeneratedTests.log_4D_float_nnfw @@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu index 3cce4f3..bc0ae0f 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu @@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 GeneratedTests.hashtable_lookup_float GeneratedTests.hashtable_lookup_float_4D_nnfw GeneratedTests.hashtable_lookup_quant8 -GeneratedTests.l2_normalization -GeneratedTests.l2_normalization_2 -GeneratedTests.l2_normalization_large GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 GeneratedTests.l2_pool_float_large @@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 GeneratedTests.neg GeneratedTests.neg_3D_int_nnfw GeneratedTests.neg_4D_int_nnfw -GeneratedTests.pad_quant8_nnfw GeneratedTests.prelu GeneratedTests.prelu_broadcast_float_1_nnfw GeneratedTests.prelu_broadcast_quant8_1_nnfw @@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.reduce_max_quant8 GeneratedTests.reduce_max_quant8_1_nnfw GeneratedTests.reduce_max_quant8_2 @@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 GeneratedTests.select_v1_2_two_dim_quant8 GeneratedTests.slice_5 GeneratedTests.slice_6 -GeneratedTests.slice_7 GeneratedTests.slice_8 GeneratedTests.slice_zero_sized GeneratedTests.slice_zero_sized_quant8 -GeneratedTests.space_to_depth_float_1 -GeneratedTests.space_to_depth_float_2 
-GeneratedTests.space_to_depth_float_3 GeneratedTests.space_to_depth_quant8_1 GeneratedTests.space_to_depth_quant8_2 GeneratedTests.sqrt_ diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp index 08118ca..069d367 100644 --- a/tests/nnapi/nnapi_gtest.skip.noarch.interp +++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp @@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8 GeneratedTests.l2_normalization GeneratedTests.l2_normalization_2 GeneratedTests.l2_normalization_large +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 GeneratedTests.l2_pool_float_large @@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2 GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw GeneratedTests.pad_quant8_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 @@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu index 3cce4f3..bc0ae0f 100644 --- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu @@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 GeneratedTests.hashtable_lookup_float GeneratedTests.hashtable_lookup_float_4D_nnfw GeneratedTests.hashtable_lookup_quant8 -GeneratedTests.l2_normalization -GeneratedTests.l2_normalization_2 -GeneratedTests.l2_normalization_large GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 GeneratedTests.l2_pool_float_large @@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 GeneratedTests.neg GeneratedTests.neg_3D_int_nnfw GeneratedTests.neg_4D_int_nnfw -GeneratedTests.pad_quant8_nnfw GeneratedTests.prelu GeneratedTests.prelu_broadcast_float_1_nnfw GeneratedTests.prelu_broadcast_quant8_1_nnfw @@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.reduce_max_quant8 GeneratedTests.reduce_max_quant8_1_nnfw GeneratedTests.reduce_max_quant8_2 @@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 GeneratedTests.select_v1_2_two_dim_quant8 GeneratedTests.slice_5 GeneratedTests.slice_6 -GeneratedTests.slice_7 GeneratedTests.slice_8 GeneratedTests.slice_zero_sized GeneratedTests.slice_zero_sized_quant8 -GeneratedTests.space_to_depth_float_1 -GeneratedTests.space_to_depth_float_2 -GeneratedTests.space_to_depth_float_3 
GeneratedTests.space_to_depth_quant8_1 GeneratedTests.space_to_depth_quant8_2 GeneratedTests.sqrt_ diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py new file mode 100644 index 0000000..ca3770c --- /dev/null +++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py @@ -0,0 +1,30 @@ +# +# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +# Copyright (C) 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +model = Model() +in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") +out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") +model = model.Operation("L2_NORMALIZATION", in0).To(out0) + +# Example 1. Input in operand 0, +input0 = {in0: # input 0 + [0, 5, 12]} +output0 = {out0: # output 0 + [51, 54, 58]} + +# Instantiate an example +Example((input0, output0)) diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/quantize.mod.py rename to tests/nnapi/specs/V1_2/quantize.mod.py diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc index 67f2467..c6c6355 100644 --- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc @@ -51,19 +51,24 @@ 
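On the new l2_normalization_quant8_nnfw spec above: the expected output [51, 54, 58] comes from dequantizing the input with scale 2e-7 and zero point 128, L2-normalizing the resulting vector, and requantizing with scale 1/128 and zero point 128, the fixed quant8 output parameters NNAPI defines for L2_NORMALIZATION (that fixed output scale is an assumption here; the spec text itself declares 2e-7 on the output operand). The input scale cancels during normalization, so only the offsets from the zero point matter. A short check of the arithmetic:

    #include <cmath>
    #include <cstdio>

    int main()
    {
      const int input[3] = {0, 5, 12}; // quant8 values from the spec
      const int zero_point = 128;

      // Offsets from the zero point; the input scale cancels in normalization.
      double v[3], sq_sum = 0.0;
      for (int i = 0; i < 3; ++i)
      {
        v[i] = input[i] - zero_point;
        sq_sum += v[i] * v[i];
      }
      const double norm = std::sqrt(sq_sum); // ~212.06

      // Requantize with scale 1/128, zero point 128 (assumed fixed output params).
      for (int i = 0; i < 3; ++i)
        std::printf("%ld ", std::lround(v[i] / norm * 128.0) + 128); // prints: 51 54 58
      std::printf("\n");
      return 0;
    }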
TEST_F(ValidationTestAddModelLoaded, output_tensorinfo) ASSERT_EQ(tensor_info.dims[0], 1); } -TEST_F(ValidationTestAddModelLoaded, neg_run_001) +TEST_F(ValidationTestAddModelLoaded, neg_run) { - ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); + // nnfw_prepare is not called + ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); } -TEST_F(ValidationTestAddModelLoaded, neg_set_input_001) +TEST_F(ValidationTestAddModelLoaded, neg_set_input) { - ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + // nnfw_prepare is not called + ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } -TEST_F(ValidationTestAddModelLoaded, neg_set_output_001) +TEST_F(ValidationTestAddModelLoaded, neg_set_output) { - ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + // nnfw_prepare is not called + ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestAddModelLoaded, neg_get_input_size) @@ -81,7 +86,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model) // load model twice ASSERT_EQ(nnfw_load_model_from_file( _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), - NNFW_STATUS_ERROR); + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo) diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc index 1bb4182..0f4a4af 100644 --- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc @@ -102,7 +102,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run) { SetInOutBuffers(); ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR); - EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); + EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR); } @@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model) // Load model twice ASSERT_EQ(nnfw_load_model_from_file( _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), - NNFW_STATUS_ERROR); + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestAddSessionPrepared, neg_prepare) { // Call Prepare twice - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } // TODO Validation check when "nnfw_run" is called without input & output tensor setting diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc index 2675aa7..01832db 100644 --- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc +++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc @@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1) nnfw_load_model_from_file( _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()), NNFW_STATUS_ERROR); - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) @@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()), NNFW_STATUS_ERROR); - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), 
NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_prepare_001) { // nnfw_load_model_from_file was not called - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_run_001) { // nnfw_load_model_from_file and nnfw_prepare was not called - ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_set_input_001) { - // Invalid state - ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_set_output_001) { - // Invalid state - ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_get_input_size) { uint32_t size = 10000; - ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR); - ASSERT_EQ(size, 10000); + ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE); + ASSERT_EQ(size, 10000); // Remain unchanged } TEST_F(ValidationTestSessionCreated, neg_get_output_size) { uint32_t size = 10000; - ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR); - ASSERT_EQ(size, 10000); + ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE); + ASSERT_EQ(size, 10000); // Remain unchanged } TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo) { nnfw_tensorinfo tensor_info; // model is not loaded - ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE); // model is not loaded and tensor_info is null - ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE); } diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh index c7f44c5..af79728 100755 --- a/tests/scripts/benchmark_nnapi.sh +++ b/tests/scripts/benchmark_nnapi.sh @@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $MY_PATH/common.sh -BENCHMARK_RUN_TEST_SH= BENCHMARK_DRIVER_BIN= BENCHMARK_REPORT_DIR= BENCHMARK_MODELS_FILE= @@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument function Usage() { - echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run" + echo "Usage: ./$0 --reportdir=. 
--driverbin=Product/out/bin/tflite_run" } for i in "$@" @@ -43,9 +42,6 @@ do --test_op) TEST_OP="true" ;; - --runtestsh=*) - BENCHMARK_RUN_TEST_SH=${i#*=} - ;; --driverbin=*) BENCHMARK_DRIVER_BIN=${i#*=} ;; @@ -147,9 +143,8 @@ function run_onert_with_all_config() local REPORT_MODEL_DIR=$2 local PAUSE_TIME_IN_SEC=$3 local BENCHMARK_DRIVER_BIN=$4 - local BENCHMARK_RUN_TEST_SH=$5 - local EXECUTORS=$6 - local BACKEND_LIST=$7 + local EXECUTORS=$5 + local BACKEND_LIST=$6 export USE_NNAPI=1 @@ -163,18 +158,18 @@ function run_onert_with_all_config() done export BACKENDS=$BACKENDS_TO_USE if [ "$TEST_OP" == "false" ]; then - profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT + profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT fi for executor in $EXECUTORS; do export EXECUTOR=$executor if [ "$TEST_OP" == "false" ]; then - run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor + run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor fi for backend in $BACKEND_LIST; do export OP_BACKEND_ALLOPS=$backend run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\ - $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH + $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN done done unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS @@ -215,14 +210,14 @@ function run_benchmark_test() # TFLite+CPU unset USE_NNAPI - run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH + run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN # run onert if [ "$TEST_OP" == "true" ]; then # Operation test don't need to test each scheduler - run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST" + run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST" else - run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST" + run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST" fi if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then diff --git a/tests/scripts/common.sh b/tests/scripts/common.sh index 8800290..b2799c2 100755 --- a/tests/scripts/common.sh +++ b/tests/scripts/common.sh @@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" function get_result_of_benchmark_test() { - local RUN_TEST_SH=$1 - local DRIVER_BIN=$2 - local MODEL=$3 - local LOG_FILE=$4 + local DRIVER_BIN=$1 + local MODEL=$2 + local LOG_FILE=$3 local RET=0 - $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1 + $MY_PATH/framework/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1 RET=$? if [[ $RET -ne 0 ]]; then echo "Testing $MODEL aborted... 
exit code: $RET" @@ -68,7 +67,7 @@ function run_benchmark_and_print() LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result print_with_dots $MSG - RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE) + RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE) echo "$RESULT ms" print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE sleep $PAUSE_TIME_IN_SEC diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/framework/run_test.sh index 44b7149..9440c52 100755 --- a/tests/scripts/framework/run_test.sh +++ b/tests/scripts/framework/run_test.sh @@ -28,10 +28,12 @@ function Usage() echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}" echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2" echo "" - echo "--download - (default=off) Download model files. Other options is ignored" - echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests" - echo "--reportdir - (default=report) directory to place tap files" - echo "--tapname - (default=framework_test.tap) file name to be written for tap" + echo "--download - (default=on) Download model files" + echo "--run - (default=on) Test model files" + echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for runnning model tests" + echo "--reportdir - (default=report) Directory to place tap files" + echo "--tapname - (default=framework_test.tap) File name to be written for tap" + echo "--md5 - (default=on) MD5 check when download model files" echo "" } @@ -43,9 +45,13 @@ function need_download() return 0; fi # Ignore checking md5 in cache + # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then return 1 fi + if [ "$MD5_CHECK" = "off" ]; then + return 1 + fi LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }') REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }') @@ -60,7 +66,9 @@ function need_download() DRIVER_BIN="" TAP_NAME="framework_test.tap" TEST_LIST=() -DOWNLOAD_MODE="off" +DOWNLOAD_MODEL="on" +RUN_TEST="on" +MD5_CHECK="on" # Support environment variable setting for mirror server FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}" @@ -84,6 +92,12 @@ do --download=*) DOWNLOAD_MODE=${i#*=} ;; + --md5=*) + MD5_CHECK=${i#*=} + ;; + --run=*) + RUN_TEST=${i#*=} + ;; *) TEST_LIST+=( $i ) ;; @@ -100,7 +114,7 @@ if [ ! -n "$DRIVER_BIN" ]; then fi # Check test driver setting -if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then +if [ ! -e $DRIVER_BIN ] && [ "$RUN_TEST" = "on" ]; then echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN" exit 1 fi @@ -139,33 +153,9 @@ run_tests() TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME - MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME" - if [ -n "$FIXED_MODELFILE_SERVER" ]; then - MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME" - fi - - # Download model file - if [ ! 
-e $TEST_CACHE_PATH ]; then - mkdir -p $TEST_CACHE_PATH - fi - - # Download unless we have it in cache (Also check md5sum) - if need_download "$MODELFILE" "$MODELFILE_URL"; then - echo "" - echo "Download test file for $TEST_NAME" - echo "======================" - - rm -f $MODELFILE # Remove invalid file if exists - pushd $TEST_CACHE_PATH - wget -nv $MODELFILE_URL - if [ "${MODELFILE_NAME##*.}" == "zip" ]; then - unzip -o $MODELFILE_NAME - fi - popd - fi # Find model file for downloaded by zip - if [ "${MODELFILE_NAME##*.}" == "zip" ]; then + if [ "${MODELFILE_NAME##*.}" = "zip" ]; then pushd $TEST_CACHE_PATH MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite) popd @@ -178,7 +168,6 @@ run_tests() # Run driver to test framework $DRIVER_BIN $MODELFILE - #$DRIVER_BIN $MODELFILE if [[ $? -eq 0 ]]; then echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME else @@ -268,10 +257,11 @@ find_tests() mkdir -p $REPORT_DIR TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]}) -if [[ "$DOWNLOAD_MODE" == "on" ]]; then +if [ "$DOWNLOAD_MODEL" = "on" ]; then download_tests $TESTS_TO_RUN - exit 0; fi -run_tests $TESTS_TO_RUN +if [ "$RUN_TEST" = "on" ]; then + run_tests $TESTS_TO_RUN +fi exit $? diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh index 615fc2c..a720b15 100755 --- a/tests/scripts/test-driver.sh +++ b/tests/scripts/test-driver.sh @@ -38,7 +38,6 @@ function Usage() echo "etc." echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests" echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests" - echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification" echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test" echo "" echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report" @@ -49,7 +48,6 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )" ARTIFACT_PATH="$TEST_DRIVER_DIR/../../" FRAMEWORK_DRIVER_BIN="" VERIFICATION_DRIVER_BIN="" -RUN_TEST_SH="" UNIT_TEST_DIR="" ALLTEST_ON="true" UNITTEST_ON="false" @@ -74,9 +72,6 @@ do --verification_driverbin=*) VERIFICATION_DRIVER_BIN=${i#*=} ;; - --runtestsh=*) - RUN_TEST_SH=${i#*=} - ;; --unittestdir=*) UNIT_TEST_DIR=${i#*=} ;; @@ -116,15 +111,6 @@ done ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)" -if [ -z "$RUN_TEST_SH" ]; then - RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh -fi - -if [ ! 
-e "$RUN_TEST_SH" ]; then - echo "Cannot find $RUN_TEST_SH" - exit 1 -fi - if [ -z "$UNIT_TEST_DIR" ]; then UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest fi @@ -149,7 +135,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then fi $TEST_DRIVER_DIR/test_framework.sh \ - --runtestsh=$RUN_TEST_SH \ --driverbin=$FRAMEWORK_DRIVER_BIN \ --reportdir=$REPORT_DIR \ --tapname=framework_test.tap \ @@ -166,7 +151,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then # verification uses the same script as frameworktest does $TEST_DRIVER_DIR/test_framework.sh \ - --runtestsh=$RUN_TEST_SH \ --driverbin=$VERIFICATION_DRIVER_BIN \ --reportdir=$REPORT_DIR \ --tapname=verification_test.tap \ @@ -180,7 +164,6 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then $TEST_DRIVER_DIR/benchmark_nnapi.sh \ --test_op \ - --runtestsh=$RUN_TEST_SH \ --driverbin=$DRIVER_BIN \ --reportdir=$REPORT_DIR/benchmark_op \ --modelfilepath=$ARTIFACT_PATH/tests/scripts/framework diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh index 1d97515..bd86cd3 100755 --- a/tests/scripts/test_framework.sh +++ b/tests/scripts/test_framework.sh @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -FWTEST_RUN_TEST_SH= +MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + FWTEST_DRIVER_BIN= FWTEST_REPORT_DIR= FWTEST_TAP_NAME= @@ -25,7 +26,6 @@ function Usage() { echo "Usage Example:" echo "./$0 \\" - echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path" echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path" echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\" echo " --reportdir=report \\ # Directory for the report files will be saved" @@ -42,9 +42,6 @@ do -h|--help|help) Usage ;; - --runtestsh=*) - FWTEST_RUN_TEST_SH=${i#*=} - ;; --driverbin=*) FWTEST_DRIVER_BIN=${i#*=} ;; @@ -67,7 +64,6 @@ do shift done -[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage [ ! -z "$FWTEST_DRIVER_BIN" ] || Usage [ ! -z "$FWTEST_REPORT_DIR" ] || Usage [ ! -z "$FWTEST_TAP_NAME" ] || Usage @@ -86,7 +82,7 @@ if [ ! 
-z "$FRAMEWORKTEST_LIST_FILE" ]; then MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}") fi -$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \ +$MY_PATH/framework/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \ --reportdir=$FWTEST_REPORT_DIR \ --tapname=$FWTEST_TAP_NAME \ ${MODELLIST:-} \ diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt index 0e333a0..ec45db4 100644 --- a/tests/tools/nnpackage_run/CMakeLists.txt +++ b/tests/tools/nnpackage_run/CMakeLists.txt @@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src) target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS}) target_link_libraries(nnpackage_run onert_core onert tflite_loader) -target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp) +target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp) target_link_libraries(nnpackage_run nnfw-dev) target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY}) target_link_libraries(nnpackage_run nnfw_lib_benchmark) diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc index 0dbcafc..cb4a7db 100644 --- a/tests/tools/nnpackage_run/src/args.cc +++ b/tests/tools/nnpackage_run/src/args.cc @@ -16,6 +16,7 @@ #include "args.h" +#include #include #include @@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv) void Args::Initialize(void) { + auto process_nnpackage = [&](const std::string &package_filename) { + _package_filename = package_filename; + + std::cerr << "Package Filename " << _package_filename << std::endl; + if (_package_filename.empty()) + { + // TODO Print usage instead of the below message + std::cerr << "Please specify nnpackage file. Run with `--help` for usage." + << "\n"; + + exit(1); + } + else + { + if (access(_package_filename.c_str(), F_OK) == -1) + { + std::cerr << "nnpackage not found: " << _package_filename << "\n"; + } + } + }; + + auto process_output_sizes = [&](const std::string &output_sizes_json_str) { + Json::Value root; + Json::Reader reader; + if (!reader.parse(output_sizes_json_str, root, false)) + { + std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; + exit(1); + } + + auto arg_map = argArrayToMap(root); + for (auto &pair : arg_map) + { + uint32_t key = pair.first; + Json::Value &val_json = pair.second; + if (!val_json.isUInt()) + { + std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; + exit(1); + } + uint32_t val = val_json.asUInt(); + _output_sizes[key] = val; + } + }; + + auto process_shape_prepare = [&](const std::string &shape_str) { + try + { + handleShapeParam(_shape_prepare, shape_str); + } + catch (const std::exception &e) + { + std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; + exit(1); + } + }; + + auto process_shape_run = [&](const std::string &shape_str) { + try + { + handleShapeParam(_shape_run, shape_str); + } + catch (const std::exception &e) + { + std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; + exit(1); + } + }; + // General options po::options_description general("General options", 100); @@ -112,32 +182,33 @@ void Args::Initialize(void) general.add_options() ("help,h", "Print available options") ("version", "Print version and exit immediately") - ("nnpackage", po::value()->required()) + ("nnpackage", po::value()->required()->notifier(process_nnpackage)) #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 - ("dump,d", po::value()->default_value(""), "Output 
filename") - ("load,l", po::value()->default_value(""), "Input filename") + ("dump,d", po::value()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename") + ("load,l", po::value()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename") #endif - ("output_sizes", po::value(), + ("output_sizes", po::value()->notifier(process_output_sizes), "The output buffer size in JSON 1D array\n" "If not given, the model's output sizes are used\n" "e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n") - ("num_runs,r", po::value()->default_value(1), "The number of runs") - ("warmup_runs,w", po::value()->default_value(0), "The number of warmup runs") - ("run_delay,t", po::value()->default_value(-1), "Delay time(ms) between runs (as default no delay") - ("gpumem_poll,g", po::value()->default_value(false), "Check gpu memory polling separately") - ("mem_poll,m", po::value()->default_value(false), "Check memory polling") - ("write_report,p", po::value()->default_value(false), + ("num_runs,r", po::value()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs") + ("warmup_runs,w", po::value()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs") + ("run_delay,t", po::value()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay") + ("gpumem_poll,g", po::value()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately") + ("mem_poll,m", po::value()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling") + ("write_report,p", po::value()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }), "Write report\n" "{exec}-{nnpkg}-{backend}.csv will be generated.\n" "e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n" "{nnpkg} name may be changed to realpath if you use symbolic-link.") - ("shape_prepare", po::value()->default_value("[]"), + ("shape_prepare", po::value()->default_value("[]")->notifier(process_shape_prepare), "set shape of specified tensor before compilation\n" "e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n") - ("shape_run", po::value()->default_value("[]"), + ("shape_run", po::value()->default_value("[]")->notifier(process_shape_run), "set shape of specified tensor right before running\n" "e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n") - ("verbose_level,v", po::value()->default_value(0), "Verbose level\n" + ("verbose_level,v", po::value()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }), + "Verbose level\n" "0: prints the only result. Messages btw run don't print\n" "1: prints result and message btw run\n" "2: prints all of messages to print\n") @@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv) return; } - po::notify(vm); try { -#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 - if (vm.count("dump")) - { - _dump_filename = vm["dump"].as(); - } - - if (vm.count("load")) - { - _load_filename = vm["load"].as(); - } -#endif - - if (vm.count("nnpackage")) - { - _package_filename = vm["nnpackage"].as(); - - if (_package_filename.empty()) - { - // TODO Print usage instead of the below message - std::cerr << "Please specify nnpackage file. Run with `--help` for usage." 
- << "\n"; - - exit(1); - } - else - { - if (access(_package_filename.c_str(), F_OK) == -1) - { - std::cerr << "nnpackage not found: " << _package_filename << "\n"; - } - } - } - - if (vm.count("output_sizes")) - { - auto output_sizes_json_str = vm["output_sizes"].as(); - - Json::Value root; - Json::Reader reader; - if (!reader.parse(output_sizes_json_str, root, false)) - { - std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; - exit(1); - } - - auto arg_map = argArrayToMap(root); - for (auto &pair : arg_map) - { - uint32_t key = pair.first; - Json::Value &val_json = pair.second; - if (!val_json.isUInt()) - { - std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; - exit(1); - } - uint32_t val = val_json.asUInt(); - _output_sizes[key] = val; - } - } - - if (vm.count("num_runs")) - { - _num_runs = vm["num_runs"].as(); - } - - if (vm.count("warmup_runs")) - { - _warmup_runs = vm["warmup_runs"].as(); - } - - if (vm.count("run_delay")) - { - _run_delay = vm["run_delay"].as(); - } - - if (vm.count("gpumem_poll")) - { - _gpumem_poll = vm["gpumem_poll"].as(); - } - - if (vm.count("mem_poll")) - { - _mem_poll = vm["mem_poll"].as(); - // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP - if (_mem_poll && _warmup_runs == 0) - { - _warmup_runs = 1; - } - } - - if (vm.count("write_report")) - { - _write_report = vm["write_report"].as(); - } - - if (vm.count("verbose_level")) - { - _verbose_level = vm["verbose_level"].as(); - } + po::notify(vm); } catch (const std::bad_cast &e) { - std::cerr << "error by bad cast" << e.what() << '\n'; + std::cerr << "Bad cast error - " << e.what() << '\n'; exit(1); } - if (vm.count("shape_prepare")) - { - std::string shape_str; - try - { - shape_str = vm["shape_prepare"].as(); - } - catch (const std::bad_cast &e) - { - std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n'; - exit(1); - } - try - { - handleShapeParam(_shape_prepare, shape_str); - } - catch (const std::exception &e) - { - std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; - exit(1); - } - } - - if (vm.count("shape_run")) + // This must be run after `notify` as `_warm_up_runs` must have been processed before. + if (vm.count("mem_poll")) { - std::string shape_str; - try - { - shape_str = vm["shape_run"].as(); - } - catch (const std::bad_cast &e) + // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP + if (_mem_poll && _warmup_runs == 0) { - std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n'; - exit(1); - } - try - { - handleShapeParam(_shape_run, shape_str); - } - catch (const std::exception &e) - { - std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; - exit(1); + _warmup_runs = 1; } } } diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc index 34c075c..09ace47 100644 --- a/tests/tools/nnpackage_run/src/h5formatter.cc +++ b/tests/tools/nnpackage_run/src/h5formatter.cc @@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector